def plot():
    """Render the per-user Protobowl summary figures as PDFs.

    Loads the Protobowl records, averages every column per ``uid`` and
    writes four figures into ``output/protobowl/``: a scatter of accuracy
    against average buzzing position, and density histograms of the log
    record count, the accuracy and the buzzing position.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df = load_protobowl()
    # Anything that is not literally True counts as an incorrect answer.
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = np.log(df.user_n_records)

    user_stat = df.groupby('uid').agg(np.mean)
    print('{} users'.format(len(user_stat)))
    print('{} records'.format(len(df)))

    # Point opacity is the log record count scaled by its maximum.
    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = user_stat.log_n_records / max_color

    # 2D user plot
    point_layer = geom_point(
        aes(x='relative_position', y='result', size='user_n_records',
            color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False})
    scatter = (
        ggplot(user_stat)
        + point_layer
        + scale_color_gradient(high='#e31a1c', low='#ffffcc')
        + labs(x='Average buzzing position', y='Accuracy')
        + theme(aspect_ratio=1)
    )
    scatter.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    def save_density_histogram(column, x_label, outline, fill, filename):
        """Save a density-scaled histogram with a density curve for *column*."""
        figure = (
            ggplot(user_stat, aes(x=column, y='..density..'))
            + geom_histogram(color=outline, fill=fill)
            + geom_density()
            + labs(x=x_label, y='Density')
            + theme(aspect_ratio=0.3)
        )
        figure.save(os.path.join(outdir, filename))

    # histogram of number of records
    save_density_histogram('log_n_records', 'Log number of records',
                           '#e6550d', '#fee6ce', 'protobowl_hist.pdf')
    print('p1 done')

    # histogram of accuracy
    save_density_histogram('result', 'Accuracy',
                           '#31a354', '#e5f5e0', 'protobowl_acc.pdf')
    print('p2 done')

    # histogram of buzzing position
    save_density_histogram('relative_position', 'Average buzzing position',
                           '#3182bd', '#deebf7', 'protobowl_pos.pdf')
    print('p3 done')
def test_exceptions():
    """stat_function must raise PlotnineError for unusable input."""
    # no x limits
    with pytest.raises(PlotnineError):
        print(ggplot(df) + stat_function(fun=np.sin))

    # fun not callable
    with pytest.raises(PlotnineError):
        print(ggplot(df, aes('x')) + stat_function(fun=1))
def plot_char_percent_vs_accuracy_histogram(self, category=False):
    """Histogram of ``char_percent`` filled by ``Outcome``.

    :param category: when truthy, facet the plot by ``category_jmlr``
    :return: the assembled ggplot object
    """
    plot = ggplot(self.char_plot_df)
    if category:
        plot = plot + facet_wrap('category_jmlr')
    return (
        plot
        + aes(x='char_percent', fill='Outcome')
        + geom_histogram(binwidth=.05)
    )
def plot_char_percent_vs_accuracy_smooth(self, category=False):
    """Smoothed curve of ``correct`` against ``char_percent``.

    :param category: when truthy, draw one default-smoothed curve per
        ``category_jmlr``; otherwise a single moving-average curve
    :return: the assembled ggplot object
    """
    base = ggplot(self.char_plot_df)
    if not category:
        return (
            base
            + aes(x='char_percent', y='correct')
            + geom_smooth(method='mavg')
        )
    return (
        base
        + aes(x='char_percent', y='correct', color='category_jmlr')
        + geom_smooth()
    )
def plot_compare_accuracy(self, expo=False):
    """Bar chart of guesser accuracy, faceted by ``position``.

    :param expo: when truthy, fill the bars by ``Dataset``, dodge them
        side by side and add axis titles; otherwise draw plain bars
    :return: the assembled ggplot object
    """
    faceted = ggplot(self.acc_df) + facet_wrap('position')
    if not expo:
        return (
            faceted
            + aes(x='guesser', y='accuracy')
            + geom_bar(stat='identity')
        )
    return (
        faceted
        + aes(x='guesser', y='accuracy', fill='Dataset')
        + geom_bar(stat='identity', position='dodge')
        + xlab('Guessing Model')
        + ylab('Accuracy')
    )
def test_limits():
    """Two cosine curves: one on the data range, one on explicit xlim."""
    plot = ggplot(df, aes('x'))
    plot = plot + stat_function(fun=np.cos, size=2, color='blue',
                                arrow=arrow(ends='first'))
    plot = plot + stat_function(fun=np.cos, xlim=(10, 20), size=2,
                                color='red', arrow=arrow(ends='last'))
    assert plot == 'limits'
def test_aesthetics():
    """Exercise each geom_point aesthetic in its own column of points."""
    # Column 'a' varies; columns 'b'..'i' hold the constants 2..9 and act
    # as x positions for the individual layers.
    columns = {'a': range(5)}
    columns.update(
        {name: value for value, name in enumerate('bcdefghi', start=2)})
    df = pd.DataFrame(columns)

    layers = [
        geom_point(aes(x='b')),
        geom_point(aes(x='c', size='a')),
        geom_point(aes(x='d', alpha='a'), size=10, show_legend=False),
        geom_point(aes(x='e', shape='factor(a)'), size=10,
                   show_legend=False),
        geom_point(aes(x='f', color='factor(a)'), size=10,
                   show_legend=False),
        geom_point(aes(x='g', fill='a'), stroke=0, size=10,
                   show_legend=False),
        geom_point(aes(x='h', stroke='a'), fill='white', color='green',
                   size=10),
        geom_point(aes(x='i', shape='factor(a)'), fill='brown', stroke=2,
                   size=10, show_legend=False),
    ]

    plot = ggplot(df, aes(y='a'))
    for layer in layers:
        plot = plot + layer
    plot = plot + theme(subplots_adjust={'right': 0.85})

    assert plot == 'aesthetics'
def test_step():
    """geom_step with the default direction and with direction='vh'."""
    default_steps = geom_step(aes(y='y'), size=4)
    vh_steps = geom_step(aes(y='y+2'), color='red', direction='vh', size=4)
    plot = ggplot(df, aes('x')) + default_steps + vh_steps
    assert plot == 'step'
def test_expand_limits():
    """expand_limits with a lower y bound of 0 and no upper bound."""
    values = range(5, 11)
    df = pd.DataFrame({'x': values, 'y': values})

    plot = ggplot(aes('x', 'y'), data=df)
    plot = plot + geom_point()
    plot = plot + expand_limits(y=(0, None))

    assert plot == 'expand_limits'
def test_bool_mapping():
    """A boolean column can be mapped to the y aesthetic."""
    data = pd.DataFrame({'x': [1, 2, 3], 'y': [True, False, False]})
    plot = ggplot(data, aes('x', 'y'))
    plot = plot + geom_point()
    assert plot == 'bool_mapping'
def test_continuous_x():
    """Loess smooth fitted on the data with three rows trimmed per end."""
    row_count = len(df_continuous_x)
    trimmed = df_continuous_x[3:row_count - 3]

    plot = ggplot(df_continuous_x, aes('x', 'y')) + geom_point()
    plot = plot + geom_smooth(trimmed, method='loess', color='blue',
                              fullrange=False)

    assert plot == 'continuous_x'
def test_legend_fill_ratio():
    """Linear smooth on points coloured by the expression 'x<0.5'."""
    noisy_y = aes(y='y_noisy')
    plot = ggplot(df_linear, aes('x', color='x<0.5'))
    plot = plot + geom_point(noisy_y)
    plot = plot + geom_smooth(aes(y='y_noisy'), method='lm', size=0.5,
                              span=.3)
    assert plot == 'legend_fill_ratio'
def test_normal_with_line():
    """Q-Q points of the normal sample together with the Q-Q line."""
    plot = ggplot(df_normal, aes(sample='x'))
    for layer in (geom_qq(), geom_qq_line()):
        plot = plot + layer
    # Expected image: roughly a straight line of points through the origin
    assert plot == 'normal_with_line'
def my_plot(df, x, y, color=None, clab=None):
    """Build a labelled ggplot of *y* against *x*, optionally coloured.

    :param df: data passed to ``gg.ggplot``
    :param x: x aesthetic, also passed to ``labs``
    :param y: y aesthetic, also passed to ``labs``
    :param color: optional column used for both colour and group
    :param clab: label handed to ``colors``; when omitted and *color* is
        given, it is *color* with every ``'pr'`` replaced by an apostrophe
    :return: the assembled plot
    """
    extra_mapping = {}
    if color:
        extra_mapping = {'color': color, 'group': color}

    if clab is None and color is not None:
        clab = color.replace('pr', "'")

    plot = gg.ggplot(df, gg.aes(x, y, **extra_mapping)) + labs(x, y)
    if color:
        return plot + colors(clab)
    # Adding an empty list leaves the plot without a colour scale.
    return plot + []
def test_non_linear_smooth_no_ci():
    """Loess smooth over the noisy points with se=False."""
    points = geom_point(aes(y='y_noisy'))
    smoother = geom_smooth(aes(y='y_noisy'), method='loess', span=.3,
                           color='blue', se=False)
    plot = ggplot(df_linear, aes('x')) + points + smoother
    assert plot == 'non_linear_smooth_no_ci'
def test_linear_smooth():
    """Linear-model smooth drawn over the noisy points."""
    points = geom_point(aes(y='y_noisy'))
    smoother = geom_smooth(aes(y='y_noisy'), method='lm', span=.3,
                           color='blue')
    plot = ggplot(df_linear, aes('x')) + points + smoother
    assert plot == 'linear_smooth'
def test_summary_functions():
    """stat_summary driven by separate mean / min / max callables."""
    summary_functions = {
        'fun_y': np.mean,
        'fun_ymin': np.min,
        'fun_ymax': np.max,
    }
    plot = ggplot(df, aes('x', 'y'))
    plot = plot + stat_summary(size=2, **summary_functions)
    assert plot == 'summary_functions'
def test_discrete_x():
    """stat_summary_bin drawn as bars over the 'xd' column."""
    summary_functions = {
        'fun_y': np.mean,
        'fun_ymin': np.min,
        'fun_ymax': np.max,
    }
    plot = ggplot(df, aes('xd', 'y'))
    plot = plot + stat_summary_bin(geom='bar', **summary_functions)
    assert plot == 'discrete_x'
def test_hull():
    """stat_hull around the mtcars points, coloured by factor(cyl)."""
    plot = ggplot(mtcars) + aes('wt', 'mpg', color='factor(cyl)')
    for layer in (geom_point(), stat_hull(size=1)):
        plot = plot + layer
    themed = plot + _theme
    assert themed == 'hull'
def test_aes_inheritance():
    """geom_abline is given no mapping of its own here.

    The slope and intercept mappings are declared only on the plot-level
    aes, so the layer has to pick them up from there.
    """
    plot_mapping = aes('x', 'y', color='factor(z)', slope='slope',
                       intercept='intercept')
    plot = ggplot(df, plot_mapping)
    plot = plot + geom_point(size=10, show_legend=False)
    plot = plot + geom_abline(size=2)
    assert plot == 'aes_inheritance'
def test_ribbon_facetting():
    """geom_ribbon filled by factor(z) and facet-wrapped on z."""
    ribbon_mapping = aes('x', ymin='ymin', ymax='ymax', fill='factor(z)')
    plot = ggplot(df, ribbon_mapping)
    plot = plot + geom_ribbon()
    plot = plot + facet_wrap('~ z')
    themed = plot + _theme
    assert themed == 'ribbon_facetting'
def test_ellipse():
    """One stat_ellipse per supported type: 't', 'norm' and 'euclid'."""
    ellipse_options = (
        {'type': 't'},
        {'type': 'norm', 'color': 'red'},
        {'type': 'euclid', 'color': 'blue'},
    )
    plot = ggplot(df, aes('x', 'y')) + geom_point()
    for options in ellipse_options:
        plot = plot + stat_ellipse(**options)
    assert plot == 'ellipse'
def test_continuous_x():
    """stat_summary_bin drawn as bars over 'xc', split into five bins."""
    summary_functions = {
        'fun_y': np.mean,
        'fun_ymin': np.min,
        'fun_ymax': np.max,
    }
    plot = ggplot(df, aes('xc', 'y'))
    plot = plot + stat_summary_bin(bins=5, geom='bar', **summary_functions)
    assert plot == 'continuous_x'
def test_funargs():
    """mean_cl_normal with default arguments and with fun_args supplied."""
    default_summary = stat_summary(fun_data='mean_cl_normal', size=2,
                                   color='blue')
    narrow_summary = stat_summary(fun_data='mean_cl_normal',
                                  fun_args={'confidence_interval': .5},
                                  size=2, color='green')
    plot = ggplot(df, aes('x', 'y')) + default_summary + narrow_summary
    assert plot == 'fun_args'
def test_aesthetics():
    """geom_spoke with alpha, linetype, colour and size mapped in turn."""
    layers = [
        geom_spoke(aes('x'), size=2),
        geom_spoke(aes('x+2', alpha='z'), size=2),
        geom_spoke(aes('x+4', linetype='factor(z)'), size=2),
        geom_spoke(aes('x+6', color='factor(z)'), size=2),
        geom_spoke(aes('x+8', size='z')),
    ]
    plot = ggplot(df, aes(y='y', angle='angle', radius='radius'))
    for layer in layers:
        plot = plot + layer
    themed = plot + _theme
    assert themed == 'aesthetics'
def test_arrow():
    """geom_path with three differently configured arrows."""
    closed_both_ends = arrow(ends='both', type='closed')
    wide_at_start = arrow(angle=60, length=1, ends='first')
    long_default = arrow(length=1)

    plot = ggplot(df, aes('x', 'y'))
    plot = plot + geom_path(size=2, arrow=closed_both_ends)
    plot = plot + geom_path(aes(y='y+2'), color='red', size=2,
                            arrow=wide_at_start)
    plot = plot + geom_path(aes(y='y+4'), color='blue', size=2,
                            arrow=long_default)
    assert plot == 'arrow'
def test_scale_without_a_mapping():
    """Drawing with a colour scale but no colour mapping must warn."""
    data = pd.DataFrame({'x': [1, 2, 3]})
    plot = ggplot(data, aes('x', 'x'))
    plot = plot + geom_point()
    plot = plot + scale_color.scale_color_continuous()

    with pytest.warns(UserWarning):
        plot.draw_test()
def test_changing_xlim_in_stat_density():
    """Building stat_density with x limits narrower than the data works."""
    lower, upper = 5, 10
    # 100 evenly spaced values reaching one unit beyond each limit.
    samples = np.linspace(lower - 1, upper + 1, 100)
    data = pd.DataFrame({'x': samples})

    plot = ggplot(data, aes('x')) + stat_density() + xlim(lower, upper)

    # No exceptions
    plot._build()
def test_errorbarh_aesthetics():
    """geom_errorbarh with alpha, linetype, colour and size mapped in turn."""
    layers = [
        geom_errorbarh(aes(y='x'), size=2),
        geom_errorbarh(aes(y='x+1', alpha='z'), height=0.2, size=2),
        geom_errorbarh(aes(y='x+2', linetype='factor(z)'), size=2),
        geom_errorbarh(aes(y='x+3', color='factor(z)'), size=2),
        geom_errorbarh(aes(y='x+4', size='z')),
    ]
    plot = ggplot(df, aes(xmin='ymin', xmax='ymax'))
    for layer in layers:
        plot = plot + layer
    themed = plot + _theme
    assert themed == 'errorbarh_aesthetics'
def test_lines():
    """Quantile regression lines at the .001, .5 and .999 quantiles."""
    quantile_lines = geom_quantile(quantiles=[.001, .5, .999],
                                   formula='y~x', size=2)
    plot = ggplot(df, aes(x='x', y='y'))
    plot = plot + geom_point(alpha=.5)
    plot = plot + quantile_lines

    # The two extreme quantile lines (.001, .999) should bound the points
    # from below and from above, and the .5 line should pass
    # (approximately) through the middle.
    assert plot == 'lines'
def test_no_fill():
    """Polygons given fill=None, fill='None' and fill='none'."""
    variants = (
        ('y', None, 'red'),
        ('y+2', 'None', 'green'),
        ('y+4', 'none', 'blue'),
    )
    plot = ggplot(df, aes('x', group='factor(z)'))
    for y_expr, fill, outline in variants:
        plot = plot + geom_polygon(aes(y=y_expr), fill=fill, color=outline,
                                   size=2)
    themed = plot + _theme
    assert themed == 'no_fill'
# Topic: Common Plots
# -----------------------------
# libraries https://pythonplot.com/#bar-count
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# pip install plotnine
# similar to ggplots
# https://plotnine.readthedocs.io/en/stable/index.html
import plotnine

# ggplot type
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap

# `plotnine.facet_wrap?` is IPython-only syntax and a SyntaxError in plain
# Python; help() shows the same documentation in either environment.
help(plotnine.facet_wrap)

# `mpg` is used further down. The star import below does not provide the
# bundled datasets, so it is imported explicitly here.
from plotnine.data import mtcars, mpg

# Weight against mileage, one panel and one linear fit per gear count.
(ggplot(mtcars, aes('wt', 'mpg', color='factor(gear)'))
 + geom_point()
 + stat_smooth(method='lm')
 + facet_wrap('~gear'))

from plotnine import *

# Bars per cylinder count, filled by transmission, stacked to full height.
(ggplot(mtcars, aes('factor(cyl)', fill='factor(am)'))
 + geom_bar(position='fill'))

# Same chart with the per-group counts written on the bars.
(ggplot(mtcars, aes('factor(cyl)', fill='factor(am)'))
 + geom_bar(position='fill')
 + geom_text(aes(label='stat(count)'), stat='count', position='fill'))

# Horizontal bar chart of the number of cars per manufacturer.
(ggplot(mpg)
 + aes(x='manufacturer')
 + geom_bar(size=20)
 + coord_flip()
 + labs(y='Count', x='Manufacturer', title='Number of Cars by Make'))

# https://plotnine.readthedocs.io/en/stable/tutorials/miscellaneous-order-plot-series.html
from pydataset import data

data()
# From here on `mtcars` refers to the pydataset copy, not plotnine's.
mtcars = data('mtcars')
data1 = mtcars.copy()
data1.head()
def analyze_index(index_array, mask, histplot=False, bins=100):
    """Collect statistics of a hyperspectral index inside a mask.

    The mean, median, standard deviation and a percentage histogram of the
    masked index values are written as observations to ``outputs``.

    Inputs:
    index_array  = Instance of the Spectral_data class, usually the output
                   from pcv.hyperspectral.extract_index
    mask         = Binary mask made from selected contours
    histplot     = if True plots histogram of intensity values
    bins         = optional, number of classes to divide spectrum into

    :param index_array: __main__.Spectral_data
    :param mask: numpy array
    :param histplot: bool
    :param bins: int
    """
    params.device += 1

    # The histogram is always computed over this fixed value range.
    hist_min, hist_max = -2.0, 2.0

    # Debug mode is switched off while the statistics are computed. The
    # try/finally restores it even when validation raises, so an error
    # here does not leave debugging disabled for the rest of the workflow.
    debug = params.debug
    params.debug = None
    try:
        if len(np.shape(mask)) > 2 or len(np.unique(mask)) > 2:
            fatal_error("Mask should be a binary image of 0 and nonzero values.")

        if len(np.shape(index_array.array_data)) > 2:
            fatal_error("index_array data should be a grayscale image.")

        # Mask data and collect statistics about pixels within the masked image
        masked_array = index_array.array_data[np.where(mask > 0)]
        index_mean = np.average(masked_array)
        index_median = np.median(masked_array)
        index_std = np.std(masked_array)

        # Calculate histogram
        hist_nir = [float(l[0]) for l in cv2.calcHist([index_array.array_data.astype(np.float32)], [0], mask, [bins],
                                                      [hist_min, hist_max])]

        # Label every bin with its lower edge. The labels must span the same
        # range that calcHist used, otherwise they do not describe the bins.
        bin_width = (hist_max - hist_min) / float(bins)
        bin_labels = [hist_min + i * bin_width for i in range(bins)]
        plotting_labels = [round(edge, 2) for edge in bin_labels]

        # Make hist percentage for plotting
        pixels = cv2.countNonZero(mask)
        hist_percent = [(p / float(pixels)) * 100 for p in hist_nir]
    finally:
        # Reset debug mode
        params.debug = debug

    if histplot is True:
        dataset = pd.DataFrame({'Index Reflectance': bin_labels,
                                'Proportion of pixels (%)': hist_percent})
        fig_hist = (ggplot(data=dataset,
                           mapping=aes(x='Index Reflectance',
                                       y='Proportion of pixels (%)'))
                    + geom_line(color='red')
                    + scale_x_continuous(breaks=plotting_labels, labels=plotting_labels))
        if params.debug == "print":
            fig_hist.save(os.path.join(params.debug_outdir,
                                       str(params.device) + index_array.array_type + '_hist.png'))
        elif params.debug == "plot":
            print(fig_hist)

    # Make sure variable names should be unique within a workflow
    outputs.add_observation(variable='mean_' + index_array.array_type,
                            trait='Average ' + index_array.array_type + ' reflectance',
                            method='plantcv.plantcv.hyperspectral.analyze_index', scale='reflectance', datatype=float,
                            value=float(index_mean), label='none')

    outputs.add_observation(variable='med_' + index_array.array_type,
                            trait='Median ' + index_array.array_type + ' reflectance',
                            method='plantcv.plantcv.hyperspectral.analyze_index', scale='reflectance', datatype=float,
                            value=float(index_median), label='none')

    outputs.add_observation(variable='std_' + index_array.array_type,
                            trait='Standard deviation ' + index_array.array_type + ' reflectance',
                            method='plantcv.plantcv.hyperspectral.analyze_index', scale='reflectance', datatype=float,
                            value=float(index_std), label='none')

    # NOTE(review): the method string below names analyze_nir_intensity
    # although this is analyze_index; it is left unchanged in case stored
    # results are keyed on it -- confirm and correct.
    outputs.add_observation(variable='index_frequencies_' + index_array.array_type, trait='index frequencies',
                            method='plantcv.plantcv.analyze_nir_intensity', scale='frequency', datatype=list,
                            value=hist_percent, label=bin_labels)

    if params.debug == "plot":
        plot_image(masked_array)
    elif params.debug == "print":
        img_name = str(params.device) + index_array.array_type + ".png"
        print_image(img=masked_array, filename=os.path.join(params.debug_outdir, img_name))
def histogram(gray_img, mask=None, bins=256, color='red', title=None):
    """Plot a histogram using ggplot.

    Inputs:
    gray_img = grayscale image to analyze
    mask     = binary mask made from selected contours
    bins     = number of classes to divide spectrum into
    color    = color of the line drawn
    title    = custom title for the plot gets drawn if title is not None

    :param gray_img: numpy.ndarray
    :param mask: numpy.ndarray
    :param bins: int
    :param color: str
    :param title: str
    :return fig_hist: ggplot
    """
    params.device += 1
    debug = params.debug

    # Apply mask if one is supplied
    if mask is not None:
        # Debug output is silenced while thresholding. The finally clause
        # restores it even if binary_threshold raises.
        params.debug = None
        try:
            mask1 = binary_threshold(mask, 0, 255, 'light')
        finally:
            params.debug = debug
        # Rescale the thresholded mask by 255 and apply it to the image.
        masked = np.multiply(gray_img, mask1 / 255)
    else:
        masked = gray_img

    maxval = 65536 if gray_img.dtype == 'uint16' else 256

    # The histogram range starts at 1, so zero-valued pixels are not counted.
    hist_gray_data, _ = np.histogram(masked, bins, (1, maxval))

    # Make hist percentage for plotting. Without any nonzero pixel every
    # count is zero, so report 0 % instead of the NaNs that 0/0 would give.
    pixels = cv2.countNonZero(masked)
    if pixels:
        hist_percent = (hist_gray_data / float(pixels)) * 100
    else:
        hist_percent = hist_gray_data * 0.0

    dataset = pd.DataFrame({'Grayscale pixel intensity': np.arange(0, bins),
                            'Proportion of pixels (%)': hist_percent})

    fig_hist = (ggplot(data=dataset,
                       mapping=aes(x='Grayscale pixel intensity',
                                   y='Proportion of pixels (%)'))
                + geom_line(color=color)
                + scale_x_continuous(breaks=list(range(0, bins, 25))))
    if title is not None:
        fig_hist = fig_hist + labels.ggtitle(title)

    if params.debug == "print":
        fig_hist.save(os.path.join(params.debug_outdir, str(params.device) + '_hist.png'))
    if params.debug == "plot":
        print(fig_hist)

    return fig_hist
def test_nudge():
    """Red points drawn with position_nudge(.25, .25) over the originals."""
    shifted = geom_point(size=10, color='red',
                         position=position_nudge(.25, .25))
    plot = ggplot(df1, aes('x', 'y'))
    plot = plot + geom_point(size=10)
    plot = plot + shifted
    themed = plot + _theme
    assert themed == 'nudge'
from plotnine.data import economics
from plotnine import ggplot, aes, geom_line, labs

# Line chart of the `uempmed` column against `date`, saved as a PNG.
g = ggplot(economics)
g = g + aes(x="date", y="uempmed")
g = g + geom_line()
g = g + labs(x="date", y="median duration of unemployment")

g.save("08.png")
def test_nonzero_indexed_data():
    """Drawing works when the frame's index starts at 98 rather than 0."""
    rows = {
        98: {'blip': 0, 'blop': 1},
        99: {'blip': 1, 'blop': 3},
    }
    # Transposing turns the outer keys (98, 99) into the row index.
    data = pd.DataFrame(rows).T
    plot = ggplot(aes(x='blip', y='blop'), data=data)
    plot = plot + geom_line()
    plot.draw_test()
def gg_f_plot(f, X):
    """Plot ``f`` evaluated element-wise on *X* as a blue line.

    :param f: scalar function, applied through ``np.vectorize``
    :param X: 1-D sequence of inputs, or a 2-D array-like whose first
        column holds the inputs
    :return: ggplot object with a single ``geom_line`` layer
    """
    # Convert first, so a 2-D nested list works as well as an ndarray.
    # Previously `X[:, 0]` raised TypeError for such input.
    X = np.asarray(X)
    if X.ndim == 1:
        X = X[:, None]
    Y = np.vectorize(f)(X)
    df = pd.DataFrame({'X': X[:, 0], 'Y': Y[:, 0]})
    return ggplot(df) + geom_line(aes('X', 'Y'), color='blue')
output = output.assign(dummy_y = 0) output print(output) # printing the result table # Perform a t-test to determine if weights are significantly different targene_geo_mutant = output[output['status_sign'] == 1] targene_geo_wt = output[output['status_sign'] == -1] # Output t-test results t_results_geo_targene = ttest_ind(a = targene_geo_mutant['weight'], b = targene_geo_wt['weight'], equal_var = False) print('Statistic = {:.2f}, p = {:.2E}'.format(t_results_geo_targene[0], Decimal(t_results_geo_targene[1]))) # graphical output for predictions p = (gg.ggplot(output, gg.aes(x='weight', y='dummy_y', color='factor(status_sign)')) + gg.geom_hline(gg.aes(yintercept=0), linetype='solid') + gg.geom_point(size=4) + gg.scale_color_manual(values=["#377eb8", "#ff7f00"], labels=['WT', 'Mutant']) + gg.ylim([-0.1, 0.1]) + gg.xlim([-0.001, 1.001]) + gg.theme_seaborn(style='whitegrid') + gg.xlab('Targene Classifier Score') + gg.ylab('') + gg.labs(color='Sample_status') + gg.ggtitle('Mutant vs WT \n') + gg.theme( plot_title=gg.element_text(size=22), axis_title_x=gg.element_text(size=16), axis_text_x=gg.element_text(size=16), axis_text_y=gg.element_blank(),
def gg_plot(X, Y):
    """Plot *Y* against *X* as a blue line.

    :param X: 1-D sequence, or a 2-D array-like whose first column is used
    :param Y: 1-D sequence, or a 2-D array-like whose first column is used
    :return: ggplot object with a single ``geom_line`` layer
    """
    # Convert first, so 2-D nested lists work as well as ndarrays.
    # Previously `X[:, 0]` / `Y[:, 0]` raised TypeError for such input.
    X = np.asarray(X)
    Y = np.asarray(Y)
    if X.ndim == 1:
        X = X[:, None]
    if Y.ndim == 1:
        Y = Y[:, None]
    df = pd.DataFrame({'X': X[:, 0], 'Y': Y[:, 0]})
    return ggplot(df) + geom_line(aes('X', 'Y'), color='blue')
def create_plots(df: pd.DataFrame) -> List[p9.ggplot]:
    """Return a one-element list holding a bar chart of 'Ewbanks Grade'."""
    grade_bars = p9.geom_bar(p9.aes(x='Ewbanks Grade'))
    return [p9.ggplot(df) + grade_bars]
from plotnine.data import mpg
from plotnine import ggplot, aes, facet_grid, labs, geom_point, stat_smooth

# Scatter of `hwy` against `displ` in a year-by-class grid, saved as a PNG.
axis_labels = labs(
    x="Engine Size",
    y="Miles per Gallon",
    title="Miles per Gallon for Each Year and Vehicle Class",
)

g = ggplot(mpg)
g = g + facet_grid(facets="year~class")
g = g + aes(x="displ", y="hwy")
g = g + axis_labels
g = g + geom_point()

g.save("16.png")
def analyze_nir_intensity(gray_img, mask, bins=256, histplot=False):
    """Histogram the grayscale intensities of the pixels selected by a mask.

    The raw per-bin counts are stored as the 'nir_frequencies' observation.
    When requested, a line plot of the percentage histogram is built as well.

    Inputs:
    gray_img = 8- or 16-bit grayscale image data
    mask     = Binary mask made from selected contours
    bins     = number of classes to divide spectrum into
    histplot = if True plots histogram of intensity values

    Returns:
    analysis_image = NIR histogram figure, or None when histplot is not True

    :param gray_img: numpy array
    :param mask: numpy array
    :param bins: int
    :param histplot: bool
    :return analysis_image: plotnine ggplot
    """
    # Thresholded mask divided by 255; used below to count selected pixels.
    unit_mask = binary_threshold(mask, 0, 255, 'light') / 255

    # Upper end of the histogram range, chosen by the image dtype.
    max_intensity = 65536 if gray_img.dtype == 'uint16' else 256

    # Make a pseudo-RGB image
    pseudo_rgb = cv2.cvtColor(gray_img, cv2.COLOR_GRAY2BGR)

    # Calculate histogram
    raw_hist = cv2.calcHist([gray_img], [0], mask, [bins], [0, max_intensity])
    hist_nir = [float(row[0]) for row in raw_hist]

    # Create list of bin labels by stepping up from 0 one bin width at a time
    step = max_intensity / float(bins)
    edge = 0
    bin_labels = [float(edge)]
    for _ in range(bins - 1):
        edge += step
        bin_labels.append(edge)

    # Make hist percentage for plotting
    selected_pixels = cv2.countNonZero(unit_mask)
    hist_percent = [(count / float(selected_pixels)) * 100 for count in hist_nir]

    masked_plant = cv2.bitwise_and(pseudo_rgb, pseudo_rgb, mask=mask)
    if params.debug is not None:
        params.device += 1
        if params.debug == "print":
            masked_path = os.path.join(params.debug_outdir,
                                       str(params.device) + "_masked_nir_plant.png")
            print_image(masked_plant, masked_path)
        if params.debug == "plot":
            plot_image(masked_plant)

    analysis_image = None

    if histplot is True:
        dataset = pd.DataFrame({'Grayscale pixel intensity': bin_labels,
                                'Proportion of pixels (%)': hist_percent})
        x_breaks = list(range(0, max_intensity, 25))
        fig_hist = (ggplot(data=dataset,
                           mapping=aes(x='Grayscale pixel intensity',
                                       y='Proportion of pixels (%)'))
                    + geom_line(color='red')
                    + scale_x_continuous(breaks=x_breaks))

        analysis_image = fig_hist
        if params.debug == "print":
            fig_hist.save(os.path.join(params.debug_outdir, str(params.device) + '_nir_hist.png'))
        elif params.debug == "plot":
            print(fig_hist)

    outputs.add_observation(variable='nir_frequencies', trait='near-infrared frequencies',
                            method='plantcv.plantcv.analyze_nir_intensity', scale='frequency', datatype=list,
                            value=hist_nir, label=bin_labels)

    # Store images
    outputs.images.append(analysis_image)

    return analysis_image
# Exploratory look at the train / test / sample-submission frames.
sample_sub.head()

train.shape
test.shape  # almost the same number of rows
sample_sub.shape  # the test set actually contains about 95 thousand people

train.isnull().sum()
test.isnull().sum()
# Both Train and Test contain nulls

########## EDA

#### TripType
train["TripType"] = train["TripType"].apply(lambda x: -1 if x == 999 else x)  # recode 999 as -1
p9.ggplot(train, p9.aes(x='TripType', fill='TripType')) + p9.geom_bar()
# Types 39 and 40 are quite common; 40 in particular is extremely frequent.
train["TripType"].value_counts()

train.info()

#### Weekday
p9.ggplot(train, p9.aes(x='Weekday', fill='Weekday')) + p9.geom_bar()
# As expected, there is more shopping on Saturday and Sunday

#### Upc
len(train['Upc'].unique())
# The Upc numbers contain many duplicate values.

train[train['Upc'].isnull()]
# When Upc is missing, DepartmentDescription and FinelineNumber are missing too.
# In particular, FinelineNumber is missing in the same rows as Upc, which
# shows the two variables are closely related.
"--input-file", type=argparse.FileType("r"), required=True, help="the benchmark output file to load", ) parser.add_argument("--output-file", type=str, required=True, help="the name for the output plot") args = parser.parse_args() # %% data = load_benchmark_output(args.input_file) # %% print(data.head()) # %% print(data.describe()) # %% (p9.ggplot( data=data[(data.Op != "CREATE") & (data.Op != "DELETE")], mapping=p9.aes(x="Op", y="MiBs", color="Api"), ) + p9.facet_wrap(facets="Op", labeller="label_both", scales="free") + p9.geom_boxplot()).save(args.output_file) # %% compare_api(data, "READ") compare_api(data, "WRITE")
def analyze_color(rgb_img, mask, bins, hist_plot_type=None):
    """Analyze the color properties of an image object.

    Histograms of the masked pixels are computed for the B, G, R, L, M, Y,
    H, S and V channels, stored in ``outputs.measurements`` and optionally
    plotted.

    Inputs:
    rgb_img          = RGB image data
    mask             = Binary mask made from selected contours
    bins             = number of color bins the channel is divided into
    hist_plot_type   = None, 'all', 'rgb', 'lab' or 'hsv'

    Returns:
    hist_header      = color histogram data table headers
    hist_data        = color histogram data table values
    analysis_images  = list of histogram figures (empty when
                       hist_plot_type is None)

    :param rgb_img: numpy.ndarray
    :param mask: numpy.ndarray
    :param bins: int
    :param hist_plot_type: str
    :return hist_header: list
    :return hist_data: list
    :return analysis_images: list
    """
    params.device += 1

    if len(np.shape(rgb_img)) < 3:
        fatal_error("rgb_img must be an RGB image")

    # Histogram plot types: name -> (dataset columns to plot, line colours).
    # Each colour list is kept exactly as the original code passed it to
    # scale_color_manual.
    plot_specs = {
        'rgb': (['blue', 'green', 'red'],
                ['blue', 'green', 'red']),
        'lab': (['lightness', 'green-magenta', 'blue-yellow'],
                ['yellow', 'magenta', 'dimgray']),
        'hsv': (['hue', 'saturation', 'value'],
                ['blueviolet', 'cyan', 'orange']),
        'all': (['blue', 'green', 'red', 'lightness', 'green-magenta', 'blue-yellow', 'hue', 'saturation',
                 'value'],
                ['blue', 'yellow', 'green', 'magenta', 'blueviolet', 'dimgray', 'red', 'cyan', 'orange']),
    }

    # If the user-input histogram color-channel plot type is not in the list of accepted channels, exit
    if hist_plot_type is not None and hist_plot_type not in plot_specs:
        fatal_error("The histogram plot type was " + str(hist_plot_type) +
                    ', but can only be one of the following: None, "all", "rgb", "lab", or "hsv"!')

    masked = cv2.bitwise_and(rgb_img, rgb_img, mask=mask)
    b, g, r = cv2.split(masked)
    l, m, y = cv2.split(cv2.cvtColor(masked, cv2.COLOR_BGR2LAB))
    h, s, v = cv2.split(cv2.cvtColor(masked, cv2.COLOR_BGR2HSV))

    # Output label -> channel, in the order used by every output table.
    channels = (('blue', b), ('green', g), ('red', r),
                ('lightness', l), ('green-magenta', m), ('blue-yellow', y),
                ('hue', h), ('saturation', s), ('value', v))

    hist_by_label = {}
    for label, channel in channels:
        # Divide by the bin width so the channel holds bin indices.
        binned = np.divide(channel, (256 / bins)).astype(np.uint8)
        # calcHist treats the upper range bound as exclusive, so it must be
        # `bins`. With `bins - 1` the highest bin index was dropped and the
        # last histogram bin was always empty.
        counts = cv2.calcHist([binned], [0], mask, [bins], [0, bins])
        hist_by_label[label] = [row[0] for row in counts]

    binval = np.arange(0, bins)
    bin_values = list(binval)

    # Store Color Histogram Data
    hist_header = [
        'HEADER_HISTOGRAM',
        'bin-number',
        'bin-values',
        'blue',
        'green',
        'red',
        'lightness',
        'green-magenta',
        'blue-yellow',
        'hue',
        'saturation',
        'value'
    ]

    hist_data = ['HISTOGRAM_DATA', bins, bin_values, *hist_by_label.values()]

    analysis_images = []
    dataset = pd.DataFrame({'bins': binval, **hist_by_label})

    # Make the histogram figure using plotnine
    if hist_plot_type is not None:
        value_columns, line_colors = plot_specs[hist_plot_type]
        long_form = pd.melt(dataset, id_vars=['bins'], value_vars=value_columns,
                            var_name='Color Channel', value_name='Pixels')
        hist_fig = (ggplot(long_form, aes(x='bins', y='Pixels', color='Color Channel'))
                    + geom_line()
                    + scale_x_continuous(breaks=list(range(0, bins, 25)))
                    + scale_color_manual(line_colors))
        analysis_images.append(hist_fig)

    # Store into global measurements
    if 'color_histogram' not in outputs.measurements:
        outputs.measurements['color_histogram'] = {}
    outputs.measurements['color_histogram'].update(
        {'bin-number': bins, 'bin-values': bin_values, **hist_by_label})

    # Store images
    outputs.images.append(analysis_images)

    return hist_header, hist_data, analysis_images
def quick_color_check(target_matrix, source_matrix, num_chips):
    """
    Quickly plot target matrix values against source matrix values to determine
    over saturated color chips or other issues.

    Inputs:
    source_matrix      = an nrowsXncols matrix containing the avg red, green, and blue values for each color chip
                            of the source image
    target_matrix      = an nrowsXncols matrix containing the avg red, green, and blue values for each color chip
                            of the target image
    num_chips          = number of color card chips included in the matrices (integer)

    Nothing is returned: the plot is saved as 'color_quick_check.png' in
    params.debug_outdir when params.debug == 'print', or printed when
    params.debug == 'plot'. params.device is incremented on every call.

    :param source_matrix: numpy.ndarray
    :param target_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, \
        scale_x_continuous, scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info. Columns 1, 2 and 3 are read as the red,
    # green and blue channels (column 0 is not used here - presumably the chip
    # number; confirm against the caller).
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = ['red'] * num_chips
    blue = ['blue'] * num_chips
    green = ['green'] * num_chips

    # Make a flat column of chip numbers, repeated once per color channel
    # (same blue, green, red stacking order as the color data below)
    chips = np.tile(np.arange(0, num_chips), 3)

    # Combine info. Stacking numbers with string labels yields string arrays;
    # the numeric columns are cast back to float further down.
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    # np.vstack replaces np.row_stack, which is deprecated since NumPy 2.0
    all_color_data = np.vstack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({'source': all_color_data[:, 0], 'target': all_color_data[:, 1],
                            'color': all_color_data[:, 2]})

    # Add chip numbers to the dataframe
    dataset['chip'] = chips
    dataset = dataset.astype({'color': str, 'chip': str, 'target': float, 'source': float})

    # Make the plot
    p1 = ggplot(dataset, aes(x='target', y='source', color='color', label='chip')) + \
        geom_point(show_legend=False, size=2) + \
        geom_smooth(method='lm', size=.5, show_legend=False) + \
        theme_seaborn() + facet_grid('.~color') + \
        geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False) + \
        scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275)) + \
        scale_color_manual(values=['blue', 'green', 'red'])

    # Autoincrement the device counter
    params.device += 1

    # Emit the plot according to the debug mode
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'), verbose=False)
        elif params.debug == 'plot':
            print(p1)
from plotnine import ggplot, aes, geom_point, facet_grid, facet_wrap
from plotnine import geom_abline, annotate
from plotnine.data import mpg
from plotnine.exceptions import PlotnineWarning

# Shared fixture data: `n` points on the diagonal plus two grouping columns.
n = 10
df = pd.DataFrame({
    'x': range(n),
    'y': range(n),
    'var1': np.repeat(range(n // 2), 2),
    'var2': np.tile(['a', 'b'], n // 2),
})
df['class'] = df['var1']  # python keyword as column
df['g'] = df['var1']      # variable as a column

# Base plot that the facet tests add a facet specification to.
g = (
    ggplot(df, aes('x', 'y'))
    + geom_point(aes(color='factor(var1)'), size=5, show_legend=False)
)


# facet_wrap

def test_facet_wrap_one_var():
    """Faceting on equivalent columns must give the same reference image."""
    formulas = (
        '~var1',
        '~class',  # python keyword in formula
        '~g',      # variable in formula
    )
    plots = [g + facet_wrap(formula) for formula in formulas]
    for plot in plots:
        assert plot == 'facet_wrap_one_var'


# https://github.com/pandas-dev/pandas/issues/16276
def test_dodge_preserve_single():
    """Compare a dodged bar chart using preserve='single' with its reference image."""
    data = pd.DataFrame({
        'x': ['a', 'b', 'b'],
        'y': ['a', 'a', 'b'],
    })
    dodge = position_dodge(preserve='single')
    plot = ggplot(data, aes('x', fill='y')) + geom_bar(position=dodge)

    assert plot + _theme == 'dodge_preserve_single'
class TestThemes:
    """Image-comparison tests: one faceted scatter plot rendered with each theme."""

    # Base plot shared by every theme test.
    g = (
        ggplot(mtcars, aes(x='wt', y='mpg', color='factor(gear)'))
        + geom_point()
        + facet_grid('vs ~ am')
    )

    def test_theme_538(self):
        """Render with theme_538."""
        assert self.g + labs(title='Theme 538') + theme_538() + _theme == 'theme_538'

    def test_theme_bw(self):
        """Render with theme_bw."""
        assert self.g + labs(title='Theme BW') + theme_bw() + _theme == 'theme_bw'

    def test_theme_classic(self):
        """Render with theme_classic."""
        assert self.g + labs(title='Theme Classic') + theme_classic() + _theme == 'theme_classic'

    def test_theme_dark(self):
        """Render with theme_dark."""
        assert self.g + labs(title='Theme Dark') + theme_dark() + _theme == 'theme_dark'

    def test_theme_gray(self):
        """Render with theme_gray."""
        assert self.g + labs(title='Theme Gray') + theme_gray() + _theme == 'theme_gray'

    def test_theme_light(self):
        """Render with theme_light."""
        assert self.g + labs(title='Theme Light') + theme_light() + _theme == 'theme_light'

    def test_theme_linedraw(self):
        """Render with theme_linedraw."""
        assert self.g + labs(title='Theme Linedraw') + theme_linedraw() + _theme == 'theme_linedraw'

    def test_theme_matplotlib(self):
        """Render with theme_matplotlib."""
        assert self.g + labs(title='Theme Matplotlib') + theme_matplotlib() + _theme == 'theme_matplotlib'

    def test_theme_minimal(self):
        """Render with theme_minimal."""
        assert self.g + labs(title='Theme Minimal') + theme_minimal() + _theme == 'theme_minimal'

    def test_theme_seaborn(self):
        """Render with theme_seaborn."""
        assert self.g + labs(title='Theme Seaborn') + theme_seaborn() + _theme == 'theme_seaborn'

    def test_theme_void(self):
        """Render with theme_void."""
        assert self.g + labs(title='Theme Void') + theme_void() + _theme == 'theme_void'

    def test_theme_xkcd(self):
        """Render with theme_xkcd; on Travis only check that it differs from gray."""
        themed = self.g + labs(title='Theme Xkcd') + theme_xkcd() + _theme
        if not os.environ.get('TRAVIS'):
            assert themed == 'theme_xkcd'
        else:
            # Travis does not have the fonts, we still check
            # to catch any other errors
            assert themed != 'theme_gray'
def plot_bar(data,nuclstr,column='value',factor=None,ymin=None,ymax=None,stat='identity',dpi=300,features=None,feature_types=['all'],add_features=[],funcgroups=None,shading_modes=['charge_functional'],usd=False,right_overhang_fix=None,debug=False,startnumber=1,cropseq=(0,None),aspect_ratio=None,reverse_seq=False,double_seq=False,transparent=True,fill_params=None,bar_position='stack',title=None):
    """
    A wrapper function to make a bar plot of data along the sequence.

    Input should be a dataframe with resid and segid columns plus the value
    column named by `column`. Only the segid of the first row is used to look
    up the sequence in `nuclstr`.
    This one is inspired by seqplot/seqplot/pdb_plot.py

    Parameters (as used by the code below):
    column              - name of the dataframe column mapped to y
    factor              - optional column mapped to the bar fill
    ymin, ymax          - optional y limits; only ymax is applied to the y
                          scale when usd is False, only ymin when usd is True;
                          missing values fall back to the data min/max for the
                          sequence image placement
    stat, bar_position  - forwarded to geom_bar (bar_position only when
                          `factor` is given)
    features            - shading features; defaults to
                          nuclstr.shading_features[segid]
    feature_types       - feature styles to keep ('all' keeps everything)
    add_features        - extra features appended after filtering
    usd                 - draw the ruler on top and pass usd=True to geom_seq_x
    right_overhang_fix  - manual override of the right-edge correction
    cropseq             - (start, stop) slice applied to the alignment columns
    reverse_seq         - reverse the sequence (experimental)
    double_seq          - add the reversed sequence as a second alignment row
    fill_params         - keyword arguments for scale_fill_manual
    title               - plot title; defaults to "Segid: ..., Type: ..."

    The list defaults in the signature are never mutated by this function.

    Returns the assembled plotnine plot.
    """
    segid = data['segid'].values[0]
    if title is None:
        title = "Segid: %s, Type: %s" % (segid, nuclstr.components[segid]['type'])

    # NOTE(review): this used to read
    #   generic_protein if entity is 'DNA' or 'histone' or 'protein' else generic_dna
    # which is always truthy (a non-empty string literal), so generic_protein
    # was selected for every entity. That effective behaviour is kept here
    # explicitly; confirm whether DNA chains were meant to get generic_dna.
    seq = Seq(str(nuclstr.seqs[segid]['fullseq']), generic_protein)
    seq_label = nuclstr.components[segid]['type'] + ':' + segid
    msar = MultipleSeqAlignment([SeqRecord(seq=seq, id=seq_label, name=seq_label)])
    if reverse_seq:
        logger.info("Experimental feature will reverse the sequence")
        msar[0].seq = msar[0].seq[::-1]
    if double_seq:
        msar.add_sequence('reverse', str(msar[0].seq[::-1]))

    msar = msar[:, cropseq[0]:cropseq[1]]

    # We need to get starting residue, currently for DNA chains only cifseq gets it correctly
    resid_start = nuclstr.seqs[segid]['resid_start']
    # The value is passed as a logging argument, so the message needs a placeholder.
    logger.debug("Starting resid %s", resid_start)

    # Shift residue ids so that they index positions of the (cropped) sequence.
    overhang = nuclstr.seqs[segid]['overhangL']
    datafixed = data.copy()
    datafixed.loc[:, 'resid'] = datafixed.loc[:, 'resid'] - resid_start + overhang + 1 - cropseq[0]

    sl = len(msar[0].seq)

    # Select the shading features to draw.
    fn = nuclstr.shading_features[segid] if features is None else features
    fn2 = [feat for feat in fn
           if (feat['style'] in feature_types) or ('all' in feature_types)]
    fn2.extend(add_features)

    ruler = 'top' if usd else None
    shaded = ipyshade.shadedmsa4plot(msar, features=fn2, shading_modes=shading_modes, debug=debug,
                                     startnumber=startnumber, setends=[startnumber - 2, sl + startnumber + 2],
                                     funcgroups=funcgroups, ruler=ruler, density=200)

    # If sl % 10 == 0 we will have a ruler number hanging beyond the sequence
    # image, and we need to correct for that.
    if right_overhang_fix is not None:
        rof = right_overhang_fix
    elif sl % 10 == 0:
        rof = 0.1 if sl < 100 else 0.5
    else:
        rof = 0

    ar = aspect_ratio if aspect_ratio is not None else 0.2 * 100. / sl

    plot = (ggplot(data=datafixed, mapping=aes(x='resid', y=column))
            + scale_x_continuous(limits=(0.5, sl + 0.5 + rof), expand=(0, 0.2), name='', breaks=[])
            + theme_light()
            + theme(aspect_ratio=ar, dpi=dpi, plot_margin=0, text=element_text(size=6),
                    legend_key_size=5, legend_position='bottom', legend_direction='horizontal'))

    if factor is None:
        plot = plot + geom_bar(stat=stat, width=0.5)
    else:
        plot = plot + geom_bar(stat=stat, width=0.5, mapping=aes(fill=factor), position=bar_position)

    if fill_params is not None:
        plot = plot + scale_fill_manual(**fill_params)

    # Only one y limit is constrained, depending on the plot orientation.
    if not usd:
        if ymax is not None:
            plot = plot + scale_y_continuous(limits=(None, ymax))
    else:
        if ymin is not None:
            plot = plot + scale_y_continuous(limits=(ymin, None))

    # The sequence image needs concrete y limits: fall back to the data range.
    if ymax is None:
        ymax = data[column].max()
    if ymin is None:
        ymin = data[column].min()

    plot = plot + geom_seq_x(seqimg=shaded.img,
                             xlim=(1, sl + rof), ylim=(ymin, ymax), usd=usd,
                             aspect_ratio=ar, transparent=transparent) + ggtitle(title)

    return plot
def test_dodge():
    """Compare a bar chart with position='dodge' against its reference image."""
    bars = geom_bar(aes(fill='factor(x)'), position='dodge')
    plot = ggplot(df2, aes('factor(z)')) + bars

    assert plot + _theme == 'dodge'
def plot_line(data,nuclstr,columns=['value'],ymin=None,ymax=None,dpi=300,features=None,feature_types=['all'],add_features=[],funcgroups=None,shading_modes=['charge_functional'],right_overhang_fix=None,debug=False,startnumber=1,cropseq=(0,None),aspect_ratio=None,reverse_seq=False,transparent=True,xshift=0):
    """
    A wrapper function to make a line plot of data along the sequence.

    Input should be a dataframe with resid and segid columns plus the value
    column(s) named in `columns` (a single column name is also accepted).
    Each column is drawn as its own colored line with points. Only the segid
    of the first row is used to look up the sequence in `nuclstr`.
    This one is inspired by seqplot/seqplot/pdb_plot.py

    funcgroup example fg="\\funcgroup{xxx}{CT}{White}{Green}{upper}{up} \\funcgroup{xxx}{GA}{White}{Blue}{upper}{up}"

    Parameters (as used by the code below):
    ymin, ymax          - optional y limits; only ymax is applied to the y
                          scale, both are used to place the sequence image and
                          fall back to the data min/max when missing
    features            - shading features; defaults to
                          nuclstr.shading_features[segid]
    feature_types       - feature styles to keep ('all' keeps everything)
    add_features        - extra features appended after filtering
    right_overhang_fix  - manual override of the right-edge correction
    cropseq             - (start, stop) slice applied to the alignment columns
    reverse_seq         - reverse the sequence (experimental)
    xshift              - extra offset added to the residue positions

    The list defaults in the signature are never mutated by this function.

    Returns the assembled plotnine plot.
    """
    if isinstance(columns, str):
        columns = [columns]
    segid = data['segid'].values[0]
    title = "Segid: %s, Type: %s" % (segid, nuclstr.components[segid]['type'])

    # NOTE(review): this used to read
    #   generic_protein if entity is 'DNA' or 'histone' or 'protein' else generic_dna
    # which is always truthy (a non-empty string literal), so generic_protein
    # was selected for every entity. That effective behaviour is kept here
    # explicitly; confirm whether DNA chains were meant to get generic_dna.
    seq = Seq(str(nuclstr.seqs[segid]['fullseq']), generic_protein)
    seq_label = nuclstr.components[segid]['type'] + ':' + segid
    msar = MultipleSeqAlignment([SeqRecord(seq=seq, id=seq_label, name=seq_label)])
    if reverse_seq:
        logger.info("Experimental feature will reverse the sequence")
        msar[0].seq = msar[0].seq[::-1]

    msar = msar[:, cropseq[0]:cropseq[1]]

    # We need to get starting residue, currently for DNA chains only cifseq gets it correctly
    resid_start = nuclstr.seqs[segid]['resid_start']
    logger.debug("Starting resid %d", int(resid_start))

    # Shift residue ids so that they index positions of the (cropped) sequence.
    overhang = nuclstr.seqs[segid]['overhangL']
    datafixed = data.copy()
    datafixed.loc[:, 'resid'] = datafixed.loc[:, 'resid'] - resid_start + overhang + 1 - cropseq[0] + xshift

    sl = len(msar[0].seq)

    # Select the shading features to draw.
    fn = nuclstr.shading_features[segid] if features is None else features
    fn2 = [feat for feat in fn
           if (feat['style'] in feature_types) or ('all' in feature_types)]
    fn2.extend(add_features)

    shaded = ipyshade.shadedmsa4plot(msar, features=fn2, shading_modes=shading_modes, debug=debug,
                                     startnumber=startnumber, setends=[startnumber - 2, sl + startnumber + 2],
                                     funcgroups=funcgroups, density=200)

    # If sl % 10 == 0 we will have a ruler number hanging beyond the sequence
    # image, and we need to correct for that.
    if right_overhang_fix is not None:
        rof = right_overhang_fix
    elif sl % 10 == 0:
        rof = 0.1 if sl < 100 else 0.5
    else:
        rof = 0

    ar = aspect_ratio if aspect_ratio is not None else 0.15 * 100. / sl

    # Long format: one row per (residue, column) so each column becomes a line.
    md = pd.melt(datafixed, id_vars=['segid', 'resid'], value_vars=columns)

    plot = (ggplot(data=md, mapping=aes(x='resid', y='value'))
            + geom_point(aes(color='variable'), size=0.1)
            + geom_line(aes(color='variable'), stat='identity')
            + scale_x_continuous(limits=(0.5, sl + 0.5 + rof), expand=(0, 0.2), name='', breaks=[])
            + theme_light()
            + theme(aspect_ratio=ar, dpi=dpi, plot_margin=0))

    if ymax is not None:
        plot = plot + scale_y_continuous(limits=(None, ymax))

    # The sequence image needs concrete y limits: fall back to the data range.
    if ymin is None:
        ymin = md['value'].min()
    if ymax is None:
        ymax = md['value'].max()

    plot = plot + geom_seq_x(seqimg=shaded.img,
                             xlim=(1, sl + rof), ylim=(ymin, ymax),
                             aspect_ratio=ar, transparent=transparent) + ggtitle(title)

    return plot
df_2.to_csv('/home/treelab/Documents/CUDAGP/script_GP1/graphs/mean_%s_%s.csv' % (df_new_2['popsize'][0], df_new_2['indsize'][0])) df_3 = df_new_3.groupby(['nrow', 'nvar'])['timewr'].mean() df_3.to_csv('/home/treelab/Documents/CUDAGP/script_GP1/graphs/mean_%s_%s.csv' % (df_new_3['popsize'][0], df_new_3['indsize'][0])) try: df_4 = df_new_4.groupby(['nrow', 'nvar'])['timewr'].mean() df_4.to_csv( '/home/treelab/Documents/CUDAGP/script_GP1/graphs/mean_%s_%s.csv' % (df_new_4['popsize'][0], df_new_4['indsize'][0])) except: print 'error' for ielem in (df_new_1, df_new_2, df_new_3, df_new_4): surveys_plot = ( p9.ggplot(data=ielem, mapping=p9.aes(x='run', y='timewr', color='factor(nvar)')) + p9.geom_point() + p9.facet_grid("~nrow") + p9.scale_y_continuous(limits=(0, 500)) + p9.scale_x_discrete(breaks=range(0, 35, 5)) + p9.theme(text=p9.element_text(size=10, family="serif"), plot_title=p9.element_text(weight='bold', size=14), legend_title=p9.element_text(weight='bold', size=14), legend_text=p9.element_text(weight='bold', size=10), axis_title_y=p9.element_text(weight='bold', size=14), axis_title_x=p9.element_text(weight='bold', size=14)) + p9.labs(y='Time (s)', x='Number of run', title='Population Size [%s]' % ielem['popsize'][0], color='Features')) # Cambiar a la direccion donde quieres guardarlos surveys_plot.save("./data_%s_%s.pdf" %
import pandas as pd titanic = pd.read_csv("/home/shaury/Downloads/nptel/titanic/train.csv", delimiter=",") test = pd.read_csv("/home/shaury/Downloads/nptel/titanic/test.csv") from plotnine import ggplot, aes, geom_bar t = titanic[titanic["Survived"] == 1] ggplot(t, aes(t["Pclass"], fill=t["Sex"])) + geom_bar() t = titanic[titanic["Survived"] == 0] ggplot(t, aes(t["Pclass"], fill=t["Sex"])) + geom_bar() t = titanic[titanic["Survived"] == 1] ggplot(t, aes(t["Pclass"], fill=t["Survived"])) + geom_bar() t = titanic[titanic["Survived"] == 0] ggplot(t, aes(t["Pclass"], fill=t["Survived"])) + geom_bar() o = titanic["Sex"] p = test["Sex"] l = [] for i in range(0, len(titanic)): if (o[i] == "female"): l.append(1) else: l.append(0) l = pd.DataFrame(l) l = pd.concat([l, titanic["Pclass"]], join='outer', axis=1) l l1 = [] for i in range(0, len(test)):
def htcalc(air_velocity_inside, air_velocity_outside, t_inside, t_outside,
           surface, layers, wall_thickness, thermal_conductivity):
    """Plot the temperature profile across a multi-layer wall.

    The convective resistance on both faces and the conductive resistance of
    each of the first `layers` entries of `wall_thickness` /
    `thermal_conductivity` are summed in series (all computed through the
    `heattransfer` helpers). The resulting heat flow is then used to derive the
    temperature after each resistance, and the profile is drawn with
    vertical lines at every transition, temperature labels and one blue
    annotation per layer.

    Returns the plotnine plot object.
    """
    # Convective film resistance on the inner and outer face of the wall.
    inner_film = heattransfer.convective_resistance(
        heattransfer.heat_transfer_coef(air_velocity_inside), surface)
    outer_film = heattransfer.convective_resistance(
        heattransfer.heat_transfer_coef(air_velocity_outside), surface)

    # All resistances in series, ordered from the inside to the outside.
    wall_resistances = [
        heattransfer.conductive_resistance(wall_thickness[k],
                                           thermal_conductivity[k], surface)
        for k in range(layers)
    ]
    series = [inner_film] + wall_resistances + [outer_film]
    total_resistance = sum(series)
    heat_transfer = heattransfer.conduction(t_inside, t_outside,
                                            total_resistance)

    # Temperature after each resistance, starting from the inside fluid.
    temperatures = [t_inside]
    cumulative = 0
    for step in series:
        cumulative += step
        temperatures.append(
            heattransfer.layer_temperature(heat_transfer, cumulative,
                                           t_inside))

    # x positions and tick labels: a 0.02 wide band stands in for the fluid
    # film on either side of the wall.
    position = [0, 0.02]
    labels = ['fluid inside', 'inner surface']
    for number, thickness in enumerate(wall_thickness, start=1):
        position.append(position[-1] + thickness)
        labels.append("layer" + str(number))
    labels[-1] = "outer surface"
    position.append(position[-1] + 0.02)
    labels.append("fluid outside")

    df = pd.DataFrame({'pos': position, 'temp': temperatures})
    breaks = df.pos.values.tolist()

    gg = p9.ggplot(df, p9.aes(x='pos', y='temp'))
    gg += p9.geom_line(p9.aes(color='temp'), size=2)
    # One grey guide line per transition.
    for ws in breaks:
        gg += p9.geom_vline(xintercept=ws, color='grey')

    decorations = (
        p9.ggtitle('heat transfer through wall'),
        p9.scale_x_continuous(name='Position', breaks=breaks, labels=labels),
        p9.scale_y_continuous(name='Temperature'),
        p9.theme(axis_text_x=p9.element_text(angle=45)),
        p9.scale_colour_gradient(low="yellow", high="orange"),
    )
    for decoration in decorations:
        gg += decoration

    # Temperature value printed 30 units above every transition point.
    for pos, temp in zip(position, temperatures):
        gg += p9.geom_text(
            p9.aes(x=pos, y=temp + 30, label=round(temp, 2)))

    # Material data centred over each wall layer.
    for k in range(layers):
        labtext = ('Thermal cond.: ' + str(thermal_conductivity[k])
                   + ' [W/m°K]\nLayer thickness: '
                   + str(round(wall_thickness[k], 3)) + ' [m]')
        left_edge = position[k + 1]
        right_edge = position[k + 2]
        gg += p9.annotate(geom='text',
                          x=((right_edge - left_edge) / 2) + left_edge,
                          y=temperatures[k] + 30,
                          label=labtext,
                          color='blue')
    return gg
def _default_rarefaction_palette(n_groups):
    """Return the smallest built-in palette covering `n_groups` colors, or None if none does."""
    if n_groups <= 20:
        return palettes.default_20
    if n_groups <= 28:
        return palettes.default_28
    if n_groups <= 102:
        return palettes.default_102
    return None


def _rarefaction_ggplot(pred, groupby, pal):
    """Assemble the rarefaction line plot; `pal` (if not None) sets the manual line colors."""
    p = (ggplot(pred, aes(x="value", y="yhat", color="variable")) +
         theme_classic() + xlab('number of cells') +
         ylab('number of clones') + ggtitle('rarefaction curve') +
         labs(color=groupby))
    if pal is not None:
        p = p + scale_color_manual(values=pal)
    return p + geom_line()


def clone_rarefaction(self,
                      groupby,
                      clone_key=None,
                      palette=None,
                      figsize=(6, 4),
                      save=None):
    """
    Plots rarefaction curve for cell numbers vs clone size.

    Only cells whose `bcr_QC_pass` entry is True (or the string 'True') are
    counted. Groups left without any counted cell are reported on stdout and
    dropped. The curve of each group is evaluated every 10 cells, plus once at
    the group's total cell number.

    Parameters
    ----------
    self : AnnData
        `AnnData` object.
    groupby : str
        Column name to split the calculation of clone numbers for a given number of cells for e.g. sample, patient etc.
    clone_key : str, optional
        Column name specifying the clone_id column in metadata/obs. Defaults to 'clone_id'.
    palette : sequence, optional
        Color mapping for unique elements in groupby. Will try to retrieve from AnnData `.uns` slot
        (key `<groupby>_colors`) if not given, then fall back to a default palette sized to the number of groups.
    figsize : tuple[float, float]
        Size of plot. Stored in the global plotnine `options.figure_size`.
    save : str, optional
        Save path; appended to 'figures/rarefaction'.

    Returns
    -------
        rarefaction curve plot.

    Raises
    ------
    TypeError
        If `self` is not an `AnnData` object.
    """
    if not isinstance(self, AnnData):
        # Previously this fell through and failed later on an unbound variable.
        raise TypeError('clone_rarefaction expects an AnnData object, got ' +
                        type(self).__name__)
    metadata = self.obs.copy()
    clonekey = 'clone_id' if clone_key is None else clone_key

    # Groups are collected before the QC filter so that groups losing every
    # cell still show up (and get reported) below.
    groups = list(set(metadata[groupby]))
    # .copy() so that the column assignment below targets an independent frame.
    metadata = metadata[metadata['bcr_QC_pass'].isin([True, 'True'])].copy()
    metadata[clonekey] = metadata[clonekey].cat.remove_unused_categories()

    # One row per group, one column per clone, values are cell counts.
    res = {
        g: metadata.loc[metadata[groupby] == g, clonekey].value_counts()
        for g in groups
    }
    res_ = pd.DataFrame.from_dict(res, orient='index')

    # remove those with no counts
    totals = res_.sum(axis=1)
    no_counts = (totals == 0).values
    print('removing due to zero counts:',
          ', '.join(str(g) for g in res_.index[no_counts]))
    sleep(0.5)
    res_ = res_[~no_counts]
    totals = totals[~no_counts]

    # calculate the rarefaction curve of every remaining group
    rarecurve = {}
    for i in tqdm(range(0, res_.shape[0]),
                  desc='Calculating rarefaction curve '):
        # Positional access: the index holds group labels, not positions.
        total = totals.iloc[i]
        # np.arange excludes its stop value, so the total is always appended;
        # this also gives a one-cell group a single evaluation point.
        n = np.append(np.arange(1, total, step=10), total)
        rarecurve[res_.index[i]] = [
            rarefun(np.array(res_.iloc[i, ]), z) for z in n
        ]

    # Wide tables (one column per group, padded with NaN) melted to long
    # format; both are built in the same group order so the rows line up.
    y = pd.DataFrame([rarecurve[c] for c in rarecurve]).T
    pred = pd.DataFrame(
        [np.append(np.arange(1, s, 10), s) for s in totals],
        index=res_.index).T

    y = y.melt()
    pred = pred.melt()
    pred['yhat'] = y['value']

    options.figure_size = figsize

    # Color choice: explicit palette > stored AnnData colors > default palette.
    if palette is not None:
        pal = palette
    else:
        try:
            pal = self.uns[str(groupby) + '_colors']
        except (KeyError, AttributeError):
            pal = _default_rarefaction_palette(len(set(pred.variable)))
    p = _rarefaction_ggplot(pred, groupby, pal)

    if save:
        # matplotlib stores figure.figsize as (width, height).
        fig_width, fig_height = plt.rcParams['figure.figsize']
        p.save(filename='figures/rarefaction' + str(save),
               height=fig_height,
               width=fig_width,
               units='in',
               dpi=plt.rcParams["savefig.dpi"])

    return (p)
def test_calculated_expressions():
    """Building a plot whose y aesthetic is an expression on a calculated value must not raise."""
    mapping = aes(x='factor(cyl)', y='..count..+1')
    plot = ggplot(mtcars, mapping) + geom_bar()

    # No exception
    plot._build()
def test_dir_v_ncol():
    """Compare facet_wrap with dir='v', ncol=4 and as_table=False against its reference image."""
    facets = facet_wrap('class', dir='v', ncol=4, as_table=False)
    plot = ggplot(mpg) + aes(x='displ', y='hwy') + facets + geom_point()

    assert plot == 'dir_v_ncol'
def _pg_edge_labels(edge_nodes, node_cols):
    """Return one label per edge from per-node labels, or the constant 1 when there are no labels."""
    if not node_cols:
        return [1] * len(edge_nodes)
    edge_labels = []
    for node_1, node_2 in edge_nodes:
        col_1, col_2 = node_cols[node_1], node_cols[node_2]
        if col_1 == "None" or col_2 == "None":
            edge_labels.append("ElPiG None")
        elif col_1 == col_2:
            edge_labels.append("ElPiG" + str(col_1))
        else:
            edge_labels.append("ElPiG Multi")
    return edge_labels


def _pg_base_plot(df1, PointViz, p_alpha):
    """Start a plot of the data points, drawn as dots ("points") or as a density field ("density")."""
    if PointViz == "points":
        return (plotnine.ggplot(data=df1, mapping=plotnine.aes(x='PCA', y='PCB')) +
                plotnine.geom_point(alpha=p_alpha, mapping=plotnine.aes(color='Group')))
    if PointViz == "density":
        return (plotnine.ggplot(data=df1, mapping=plotnine.aes(x='PCA', y='PCB')) +
                plotnine.stat_density_2d(contour=True, alpha=.5, geom='polygon',
                                         mapping=plotnine.aes(fill='..level..')))
    raise ValueError("Invalid point representation selected")


def PlotPG(X,
           TargetPG,
           BootPG=None,
           PGCol="",
           PlotProjections="none",
           GroupsLab=None,
           PointViz="points",
           Main='',
           p_alpha=.3,
           PointSize=None,
           NodeLabels=None,
           LabMult=1,
           Do_PCA=True,
           DimToPlot=[0, 1],
           VizMode=("Target", "Boot")):
    """Plot data and principal graph(s).

    Work in progress, only basic plotting is supported: `BootPG`, `Main`,
    `NodeLabels` and `LabMult` are accepted but not used yet, and "Boot" in
    `VizMode` has no effect.

    Parameters
    ----------
    X : numerical 2D matrix, the n-by-m matrix with the position of n
        m-dimensional points.
    TargetPG : the main principal graph to plot. Its 'NodePositions' entry and
        the first element of its 'Edges' entry (node index pairs) are used.
    BootPG : a list of principal graphs that will be considered as
        bootstrapped curves (not plotted yet).
    PGCol : string or sequence, the label(s) to be used for the main principal
        graph. A single string or a one-element sequence is applied to every
        node; otherwise one label per node is expected. Empty means no labels.
    PlotProjections : string, the plotting mode for the node projection on the
        principal graph. It can be "none" (no projections will be plotted),
        "onNodes" (the projections will indicate how points are associated to
        nodes), and "onEdges" (the projections will indicate how points are
        projected on edges or nodes of the graph).
    GroupsLab : factor or numeric vector indicating either a category or a
        numeric value associated with each data point. Defaults to "N/A" for
        every point.
    PointViz : string, the modality to show points. It can be 'points' (data
        will be represented as dots) or 'density' (the data will be
        represented by a field).
    Main : string, the title of the plot (not used yet).
    p_alpha : numeric between 0 and 1, the alpha value of the points. Lower
        values will produce more transparent points.
    PointSize : number or numeric vector indicating the size to be associated
        with each node of the graph. A single value is applied to every node.
    NodeLabels : string vector indicating the label to be associated with each
        node of the graph (not used yet).
    LabMult : numeric, a multiplier controlling the size of node labels (not
        used yet).
    Do_PCA : boolean, should the nodes of the principal graph be used to derive
        principal component projections and rotate the space? If True the
        plots will use the "EpG PC" as dimensions, if False, the original
        dimensions will be used.
    DimToPlot : integer vector specifying the PCs (if Do_PCA=True) or
        dimensions (if Do_PCA=False) to plot. All the pairwise combinations
        will be considered, so, for example, three dimensions produce three
        plots and fewer than two dimensions produce none. Never mutated here.
    VizMode : vector of strings describing the ElPiGraphs to visualize. Any
        combination of "Target" and "Boot".

    Returns
    -------
    list with one plotnine plot per pair of dimensions.

    Raises
    ------
    ValueError
        If `PointViz` is neither "points" nor "density" (only checked when
        there is at least one pair of dimensions to plot).
    """
    NodePositions = TargetPG['NodePositions']
    n_nodes = len(NodePositions)

    # Normalise PGCol to one label per node (or an empty list for "no labels").
    # A string is always treated as ONE label, never as a sequence of characters.
    if PGCol is None:
        node_cols = []
    elif isinstance(PGCol, str):
        node_cols = [PGCol] * n_nodes if PGCol else []
    else:
        node_cols = list(PGCol)
        if len(node_cols) == 1:
            node_cols = node_cols * n_nodes

    if GroupsLab is None:
        GroupsLab = ["N/A"] * len(X)

    # Normalise PointSize to one size per node; scalars are accepted too.
    node_sizes = None
    if PointSize is not None:
        node_sizes = np.ravel(PointSize)
        if node_sizes.size == 1:
            node_sizes = np.repeat(node_sizes, n_nodes)

    if Do_PCA:
        # Perform PCA on the nodes
        mv = NodePositions.mean(axis=0)
        vglobal, NodesPCA, _explained_variances = PCA(NodePositions - mv)
        # Rotate the data using eigenvectors
        BaseData = np.dot((X - mv), vglobal)
    else:
        NodesPCA = NodePositions
        BaseData = X

    AllComb = list(combinations(DimToPlot, 2))
    if not AllComb:
        return []
    if PointViz not in ("points", "density"):
        raise ValueError("Invalid point representation selected")

    # Everything below that does not depend on the plotted pair of dimensions
    # is computed once.
    edge_nodes = [(edge[0], edge[1]) for edge in TargetPG['Edges'][0]]
    edge_labels = _pg_edge_labels(edge_nodes, node_cols)

    Partition = None
    OnEdgProj = None
    if PlotProjections in ("onEdges", "onNodes"):
        Partition = PartitionData(X=BaseData,
                                  NodePositions=NodesPCA,
                                  MaxBlockSize=100000000,
                                  SquaredX=np.sum(BaseData**2, axis=1, keepdims=1),
                                  TrimmingRadius=float('inf'))[0]
        if PlotProjections == "onEdges":
            # NOTE(review): the whole 'Edges' entry is passed here while the
            # edge list above is read from its first element - confirm which
            # one project_point_onto_graph expects.
            OnEdgProj = project_point_onto_graph(X=BaseData,
                                                 NodePositions=NodesPCA,
                                                 Edges=TargetPG['Edges'],
                                                 Partition=Partition)

    # TODO: not ported from the R implementation yet: bootstrapped graphs
    # (BootPG / "Boot" in VizMode), node labels (NodeLabels, LabMult), axis
    # labels reporting the explained variance, and the plot title (Main with
    # FVE / FVEP).
    # NOTE(review): the segment layers below map `col=`; plotnine documents
    # `color`/`colour`, so `col` looks like an R abbreviation that may simply
    # be ignored. It is left untouched because switching to `color` would put
    # these labels on the same color scale as 'Group' - confirm the intent.
    PlotList = []
    for Idx1, Idx2 in AllComb:
        # Base data
        df1 = pd.DataFrame.from_dict(
            dict(PCA=BaseData[:, Idx1], PCB=BaseData[:, Idx2], Group=GroupsLab))
        p = _pg_base_plot(df1, PointViz, p_alpha)

        # Target graph edges in the plotted plane
        df2 = pd.DataFrame.from_dict(
            dict(x=[NodesPCA[node_1, Idx1] for node_1, _ in edge_nodes],
                 y=[NodesPCA[node_1, Idx2] for node_1, _ in edge_nodes],
                 xend=[NodesPCA[node_2, Idx1] for _, node_2 in edge_nodes],
                 yend=[NodesPCA[node_2, Idx2] for _, node_2 in edge_nodes],
                 Col=edge_labels))

        # Plot projections
        if PlotProjections == "onEdges":
            ProjDF = pd.DataFrame.from_dict(
                dict(X=BaseData[:, Idx1],
                     Y=BaseData[:, Idx2],
                     Xend=OnEdgProj['X_projected'][:, Idx1],
                     Yend=OnEdgProj['X_projected'][:, Idx2],
                     Group=GroupsLab))
            p = p + plotnine.geom_segment(
                data=ProjDF,
                mapping=plotnine.aes(x='X', y='Y', xend='Xend', yend='Yend', col='Group'),
                inherit_aes=False)
        elif PlotProjections == "onNodes":
            # np.ravel is a no-op for a 1-D partition and flattens an (n, 1) one.
            ProjDF = pd.DataFrame.from_dict(
                dict(X=BaseData[:, Idx1],
                     Y=BaseData[:, Idx2],
                     Xend=np.ravel(NodesPCA[Partition, Idx1]),
                     Yend=np.ravel(NodesPCA[Partition, Idx2]),
                     Group=GroupsLab))
            p = p + plotnine.geom_segment(
                data=ProjDF,
                mapping=plotnine.aes(x='X', y='Y', xend='Xend', yend='Yend', col='Group'),
                inherit_aes=False,
                alpha=.3)

        if "Target" in VizMode:
            # Graph edges first, then the nodes on top of them.
            p = p + plotnine.geom_segment(
                data=df2,
                mapping=plotnine.aes(x='x', y='y', xend='xend', yend='yend', col='Col'),
                inherit_aes=True) + plotnine.labs(linetype="")

            df4 = pd.DataFrame.from_dict(
                dict(PCA=NodesPCA[:, Idx1], PCB=NodesPCA[:, Idx2]))
            if node_sizes is not None:
                df4['PointSize'] = node_sizes
                p = p + plotnine.geom_point(
                    mapping=plotnine.aes(x='PCA', y='PCB', size='PointSize'),
                    data=df4,
                    inherit_aes=False)
            else:
                p = p + plotnine.geom_point(
                    mapping=plotnine.aes(x='PCA', y='PCB'),
                    data=df4,
                    inherit_aes=False)

        PlotList.append(p)

    return (PlotList)