def calculate_ndvi_and_cloud_percent_for_the_parcel(df_ext, cloud_categories):
    """Build a per-reference NDVI / cloud-percentage table for one parcel.

    The input is expected to hold one row per (reference, band) with at least
    the columns ``date_part`` (epoch seconds), ``hist``, ``band``, ``mean``,
    ``count``, ``std`` and ``reference``.  Rows of bands ``B04`` and ``B08``
    are joined on ``reference`` and the index
    ``(mean_B08 - mean_B04) / (mean_B08 + mean_B04)`` is stored as ``ndvi``.

    Args:
        df_ext: DataFrame with the extracted band statistics.  It is copied
            first, so the caller's frame is never modified.
        cloud_categories: Passed through unchanged to ``get_cloudyness``.

    Returns:
        The merged DataFrame (columns suffixed with the band name, plus
        ``ndvi``, ``utm_number`` and ``ndvi_std``), or an empty DataFrame when
        either band has no rows.

    Side effects:
        On the success path the global pandas display options
        ``display.precision`` and ``display.float_format`` are changed.
    """
    # Work on a copy to avoid changing the dataframe passed to this function.
    df = df_ext.copy()

    # Convert the epoch timestamp to a (local-time) datetime.
    df['date_part'] = df['date_part'].map(
        lambda e: datetime.datetime.fromtimestamp(e))
    # get_cloudyness returns a sequence; element 1 is used as the percentage.
    df['cloud_pct'] = df['hist'].apply(
        lambda s: get_cloudyness(s, cloud_categories)[1])

    bands = ['B04', 'B08']
    band0, band1 = bands
    is_band0 = df['band'] == band0
    is_band1 = df['band'] == band1

    # NDVI needs both bands; without them there is nothing to compute.
    if not (is_band0.any() and is_band1.any()):
        return pd.DataFrame()

    # Treat each band separately.
    columns = ['date_part', 'mean', 'count', 'std', 'cloud_pct', 'reference']
    df0 = df[is_band0][columns]
    df1 = df[is_band1][columns]

    # Merge back into one DataFrame based on reference that should be unique.
    dff = pd.merge(df0, df1, on='reference', suffixes=(band0, band1))

    mean0, mean1 = f"mean{band0}", f"mean{band1}"
    std0, std1 = f"std{band0}", f"std{band1}"
    dff['ndvi'] = (dff[mean1] - dff[mean0]) / (dff[mean1] + dff[mean0])
    dff['utm_number'] = dff['reference'].apply(get_utm_number_from_reference)
    # Column names are derived from ``bands`` so they stay in sync with the
    # merge suffixes instead of being hard-coded a second time.
    dff['ndvi_std'] = dff.apply(
        lambda x: calculate_ndvi_std_from_band_mean_and_std(
            x[mean0], x[mean1], x[std0], x[std1]),
        axis=1)

    # Use the fully qualified option name: the bare 'precision' pattern is
    # ambiguous (and raises OptionError) on pandas versions that also define
    # 'styler.format.precision'.
    pd.set_option('display.precision', 3)
    pd.set_eng_float_format(accuracy=3)
    return dff
def t_test(data=None, independent=None, dependent=None, alpha=0.05):
    """Run an independent two-sample t-test and summarise it in a table.

    Args:
        data: DataFrame holding both variables.
        independent: Name of the grouping column; it must contain exactly two
            distinct values.
        dependent: Name of the numeric outcome column.
        alpha: Significance level for the confidence interval of the mean
            difference (default 0.05, i.e. a 95% interval).

    Returns:
        A one-column DataFrame (``Value``) with the absolute difference
        between means, its standard error, the t statistic, the p-value, the
        confidence bounds and Cohen's d.  ``None`` is returned (after printing
        an explanation) when the grouping column does not have exactly two
        groups.

    Side effects:
        Changes the global pandas float display format.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)
    independent_groups = pd.unique(data[independent])
    if len(independent_groups) > 2:
        print('There are more than 2 groups in the independent variable')
        print('t-test is not the correct statistical test to run in that circumstance,')
        print('consider running an ANOVA')
        return
    if len(independent_groups) < 2:
        # Previously this fell through to an IndexError on the second group.
        print('There are fewer than 2 groups in the independent variable')
        print('t-test needs exactly two groups to compare')
        return

    # NOTE(review): parammct is defined elsewhere; it is assumed to return a
    # table with rows 'Mean', 'SD' and 'n' and one column per group - confirm.
    mct = parammct(data=data, independent=independent, dependent=dependent)

    first_group = data[dependent][data[independent] == independent_groups[0]]
    second_group = data[dependent][data[independent] == independent_groups[1]]
    t_test_value, p_value = stats.ttest_ind(first_group, second_group)

    # Explicit positional access: plain ``series[0]`` on a label-indexed
    # Series is deprecated in pandas.
    mean0, mean1 = mct.loc['Mean'].iloc[0], mct.loc['Mean'].iloc[1]
    sd0, sd1 = mct.loc['SD'].iloc[0], mct.loc['SD'].iloc[1]
    n0, n1 = mct.loc['n'].iloc[0], mct.loc['n'].iloc[1]

    difference_mean = np.abs(mean0 - mean1)
    degrees_of_freedom = n0 + n1 - 2
    pooled_sd = np.sqrt(
        ((n0 - 1) * sd0 ** 2 + (n1 - 1) * sd1 ** 2) / degrees_of_freedom)
    sedifference = pooled_sd * np.sqrt((1 / n0) + (1 / n1))

    # The interval half-width is the *critical* t value times the standard
    # error.  Using the observed t statistic here (as before) collapses the
    # interval to roughly [0, 2 * difference].
    t_critical = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)
    margin = t_critical * sedifference
    difference_mean_cilower = difference_mean - margin
    difference_mean_ciupper = difference_mean + margin

    cohend = difference_mean / pooled_sd

    t_test_result = pd.DataFrame(
        [difference_mean, sedifference, t_test_value, p_value,
         difference_mean_cilower, difference_mean_ciupper, cohend],
        index=['Difference between means', 'SE difference', 't-test',
               'p-value', 'Lower bound difference CI',
               'Upper bound difference CI', 'Cohen\'s d'],
        columns=['Value'])
    return t_test_result
def chi_square(data=None, variable1=None, variable2=None):
    """Display a contingency table and a chi-square test for two columns.

    Args:
        data: DataFrame holding both variables.
        variable1: Name of the first categorical column (coerced to ``str``).
        variable2: Name of the second categorical column (coerced to ``str``).

    Returns:
        None.  Results are shown through ``display``; nothing is returned.
        The function stops early when ``input_check_categorical_categorical``
        returns a truthy value or when either column has fewer than two
        distinct values.

    Side effects:
        Changes the global pandas float display format.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)
    variable1 = str(variable1)
    variable2 = str(variable2)

    # A truthy result from the input check means "do not continue".
    if input_check_categorical_categorical(data, variable1, variable2):
        return

    problem_found = False
    for column_name in (variable1, variable2):
        category_count = len(pd.unique(data[column_name]))
        if category_count < 2:
            # Report the column *name*; the previous message printed the
            # array of unique values, which did not identify the column.
            print(column_name, 'has less than two categories. It has:',
                  category_count)
            problem_found = True
    if problem_found:
        return

    # pd.crosstab already returns a DataFrame, no extra wrapping needed.
    contingency_table = pd.crosstab(data[variable1], data[variable2])
    display(Markdown('**Contingency Table**'))
    display(contingency_table)

    # correction=False disables the Yates continuity correction.
    chi2_test = stats.chi2_contingency(contingency_table, correction=False)
    chi2_result = pd.Series(
        [chi2_test[0], chi2_test[1], chi2_test[2], chi2_test[3]],
        index=['Chi-square value', 'p-value', 'Degrees of freedom',
               'Expected frequencies'],
        name='Value').to_frame()
    display(Markdown('**Results Chi-square test**'))
    display(chi2_result)
    return
def logistic_reg(data=None, independent=None, dependent=None):
    """Fit and display a one-predictor logistic regression with intercept.

    Args:
        data: DataFrame holding both variables.  It is left untouched.
        independent: Name of the predictor column (coerced to ``str``).
        dependent: Name of the outcome column (coerced to ``str``); it must
            contain exactly two distinct values.

    Returns:
        None.  The model summary, the coefficient confidence intervals and a
        plot of actual values against predicted probabilities are displayed.
        The function stops early when ``input_check_categorical`` returns a
        truthy value or when the outcome is not binary.

    Side effects:
        Changes the global pandas float display format and draws a
        matplotlib figure.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)
    independent = str(independent)
    dependent = str(dependent)

    # A truthy result from the input check means "do not continue".
    if input_check_categorical(data, independent, dependent):
        return

    category_count = len(pd.unique(data[dependent]))
    if category_count != 2:
        print('Dependent variable must have two categories')
        print(dependent, 'variable has', category_count, 'categories')
        return

    # Build the design matrix on a fresh frame.  The constant column used to
    # be written into ``data`` itself, which leaked an 'interceptant' column
    # into the caller's DataFrame on every call.
    design = data[[independent]].assign(interceptant=1)

    logReg = sm.Logit(data[dependent], design)
    regression = logReg.fit()
    display(regression.summary())
    display(Markdown('**Coefficients confidence intervals**'))
    display(regression.conf_int())

    predicted_values = regression.predict()
    plt.plot(data[independent], data[dependent], 'o', label='Actual values')
    plt.plot(data[independent], predicted_values, 'ok',
             label='Predicted probabilities')
    plt.xlabel(independent, fontsize=14)
    plt.ylabel('Probability ' + dependent, fontsize=14)
    plt.ylim(-0.05, 1.05)
    plt.legend()
    plt.show()
    return
def printStats(players):
    """Print a table of the (up to) 25 players with the highest ``totalP``.

    Args:
        players: Mapping of name -> player object.  Each player must expose
            ``name``, ``totalP``, ``avgP``, ``medianP``, ``stddevP``,
            ``totalT``, ``avgT`` and ``stddevT``.

    Returns:
        None.  The table is printed with one row per player and the columns
        ``T, A, M, V, TT, AT, VT``, sorted by ``T`` in descending order.

    Side effects:
        Changes the global pandas float display format.
    """
    players_sorted = sorted(players.values(), key=lambda p: p.totalP)
    # ``[-25:]`` keeps everything when there are fewer than 25 players.  The
    # previous ``[len(players_sorted) - 25:]`` produced a negative start index
    # in that case and silently dropped players (e.g. 20 players -> 5 rows).
    top_players = players_sorted[-25:]

    d = {}
    for player in reversed(top_players):
        d[player.name] = [player.totalP, player.avgP, player.medianP,
                          player.stddevP, player.totalT, player.avgT,
                          player.stddevT]

    pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
    idx = ["T", "A", "M", "V", "TT", "AT", "VT"]
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    table = pd.DataFrame(d, index=idx).transpose().sort_values(
        "T", ascending=False)
    # Single-argument parenthesised print works on Python 2 and Python 3.
    print(table)
def tukey(data=None, independent=None, dependent=None):
    """Display a Tukey HSD multiple-comparison summary and its plot.

    Args:
        data: DataFrame holding both variables.
        independent: Name of the grouping column (coerced to ``str``).
        dependent: Name of the outcome column (coerced to ``str``).

    Returns:
        None.  Nothing is computed when
        ``input_check_numerical_categorical`` returns a truthy value.

    Side effects:
        Changes the global pandas float display format.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)

    group_column = str(independent)
    value_column = str(dependent)

    # A truthy result from the input check means "do not continue".
    inputs_rejected = input_check_numerical_categorical(
        data, group_column, value_column)
    if not inputs_rejected:
        comparison = multi.MultiComparison(data[value_column],
                                           data[group_column])
        hsd_result = comparison.tukeyhsd()
        display(hsd_result.summary())
        hsd_result.plot_simultaneous()
    return None
def estimate_relative_error_in_nominal_capacitance(df):
    """Summarise the relative error of measured vs. nominal capacitance.

    For every row the relative error ``(C - test_capacitor) / test_capacitor``
    is computed; the errors are then grouped by ``test_capacitor`` and
    described (count, mean, std, quartiles, ...).

    Args:
        df: DataFrame with at least the columns ``C`` and ``test_capacitor``.
            Rows containing any missing value are ignored.

    Returns:
        DataFrame indexed by ``test_capacitor`` with one column per
        ``describe()`` statistic of the relative error.

    Side effects:
        Prints a summary and changes the global pandas float display format.
    """
    # Calculate the relative percentage difference in the mean capacitance
    # values measured relative to the nominal values.
    cleaned_df = df.dropna().copy()
    nominal = cleaned_df['test_capacitor']
    relative_error = (cleaned_df['C'] - nominal) / nominal
    # Group the precomputed error by the nominal value instead of reading the
    # grouping column inside ``groupby(...).apply``; newer pandas no longer
    # passes grouping columns to the applied function.
    # NOTE(review): SeriesGroupBy.describe returns this wide layout on
    # pandas >= 0.20 - confirm the minimum supported pandas version.
    C_relative_error = relative_error.groupby(nominal).describe()

    pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
    # Parenthesised single-argument prints run on Python 2 and Python 3.
    print('Estimated relative error in nominal capacitance values = %.1f%% '
          ' +/-%.1f%%' % (C_relative_error['mean'].mean() * 100,
                          C_relative_error['mean'].std() * 100))
    print(C_relative_error[['mean', 'std']] * 100)
    print('')
    return C_relative_error
def estimate_relative_error_in_nominal_capacitance(df):
    """Describe, per nominal capacitor, the relative measurement error.

    The relative error of each row is
    ``(C - test_capacitor) / test_capacitor``.  Errors are grouped by the
    nominal ``test_capacitor`` value and summarised with ``describe()``.

    Args:
        df: DataFrame providing the columns ``C`` and ``test_capacitor``;
            rows with missing values are dropped before the calculation.

    Returns:
        DataFrame with one row per nominal capacitor and the ``describe()``
        statistics (``count``, ``mean``, ``std``, ...) as columns.

    Side effects:
        Prints the overall and per-capacitor errors in percent and changes
        the global pandas float display format.
    """
    # Calculate the relative percentage difference in the mean capacitance
    # values measured relative to the nominal values.
    cleaned_df = df.dropna().copy()
    errors = ((cleaned_df['C'] - cleaned_df['test_capacitor'])
              / cleaned_df['test_capacitor'])
    # The error is computed up front and grouped by the nominal value, so the
    # grouping column never has to be read inside ``groupby(...).apply``
    # (which newer pandas versions no longer allow).
    # NOTE(review): relies on SeriesGroupBy.describe returning a wide frame
    # (pandas >= 0.20) - confirm the minimum supported pandas version.
    C_relative_error = errors.groupby(cleaned_df['test_capacitor']).describe()

    pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
    mean_of_means = C_relative_error['mean'].mean() * 100
    spread_of_means = C_relative_error['mean'].std() * 100
    # print(...) with exactly one argument is valid on Python 2 and 3 alike.
    print('Estimated relative error in nominal capacitance values = %.1f%% '
          ' +/-%.1f%%' % (mean_of_means, spread_of_means))
    print(C_relative_error[['mean', 'std']] * 100)
    print('')
    return C_relative_error
def anova(data=None, independent=None, dependent=None):
    """Run a one-way ANOVA (type 2) and return the table with eta squared.

    Args:
        data: DataFrame holding both variables.
        independent: Name of the grouping column (coerced to ``str``).
        dependent: Name of the outcome column (coerced to ``str``).

    Returns:
        The ``anova_lm`` table with ``PR(>F)`` renamed to ``p`` and an extra
        ``Eta squared`` column.  In the ``Residual`` row the ``F``, ``p`` and
        ``Eta squared`` cells hold an empty string.  ``None`` is returned when
        ``input_check_numerical_categorical`` returns a truthy value.

    Side effects:
        Changes the global pandas float display format.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)
    independent = str(independent)
    dependent = str(dependent)

    # A truthy result from the input check means "do not continue".
    if input_check_numerical_categorical(data, independent, dependent):
        return

    formula = dependent + ' ~ ' + independent
    model = ols(formula, data=data).fit()
    aov_table = sm.stats.anova_lm(model, typ=2)
    aov_table.rename(columns={'PR(>F)': 'p'}, inplace=True)

    row_labels = [independent, 'Residual']
    # ``.iloc`` makes the positional lookups explicit; plain ``series[0]`` on
    # a label-indexed Series is deprecated in pandas.
    f_value = aov_table['F'].iloc[0]
    p_value = aov_table['p'].iloc[0]
    effect_ss = aov_table['sum_sq'].iloc[0]
    residual_ss = aov_table['sum_sq'].iloc[1]

    # Blank out the statistics that have no meaning for the residual row.
    aov_table['F'] = pd.Series([f_value, ''], index=row_labels)
    aov_table['p'] = pd.Series([p_value, ''], index=row_labels)

    eta_sq = effect_ss / (effect_ss + residual_ss)
    aov_table['Eta squared'] = pd.Series([eta_sq, ''], index=row_labels)
    return aov_table
import textwrap import numpy as np import pandas as pd from matplotlib import pylab as plt from matplotlib.gridspec import GridSpec from matplotlib import ticker from matplotlib.ticker import MaxNLocator from mpl_toolkits.axes_grid.anchored_artists import AnchoredText from .kplot import tprop from .. import tfind from .. import tval pd.set_eng_float_format(accuracy=3,use_eng_prefix=True) plt.rc('axes',color_cycle=['RoyalBlue','Tomato']) plt.rc('font',size=8) def print_traceback(f): """ Decorator so that we can fail gracefully from a plotting mishap """ def wrapper_function(*args, **kwargs): try: return f(*args, **kwargs) except Exception: ax = plt.gca() error = traceback.format_exc() print(error) error = textwrap.fill(error,50) ax.text(0, 1, error, transform=ax.transAxes, va='top')
def write_csv(self):
    """Write ``self.result`` to ``self.outfile`` as UTF-8 CSV without index.

    When ``self.outfile`` is falsy nothing is written and a message is sent
    to ``utils.log`` instead.
    """
    target = self.outfile
    if not target:
        utils.log('No outfile specified!')
        return

    # NOTE(review): this changes pandas' global display float format; to_csv
    # has its own ``float_format`` argument, so it presumably does not affect
    # the file contents - confirm the intent.
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=True)
    self.result.to_csv(target, encoding='utf-8', index=False)
sep=' ', index=False, header=False, mode='a', encoding='utf-8', float_format='%.4f', index_label=None) # data.to_pickle(des,compression='zip') print('*' * 40) if __name__ == "__main__": print('program started at:', time.asctime(time.localtime(time.time())) ) #time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time())) pd.set_eng_float_format(7, True) pd.set_option('precision', 7) # pd.set_option('chop_threshold', .5) # source_addr = "/home/gjj/PycharmProjects/ADA/netsData/hackingData/GANdata/from_raw_change_scaler/2/data/Attack_free_dataset2.pkl" # dire_addr = "/home/gjj/PycharmProjects/ADA/netsData/hackingData/GANdata/from_raw_change_scaler/" # dire_addr = "/home/gjj/PycharmProjects/ADA/netsData/hackingData/GANdata/from_raw_change_scaler/2/data" # print('program start at:', time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))) # # print('data from :%s'%source_addr) # print() # os.chdir(os.path.dirname(dire_addr)) # dire_url = os.path.join(dire_addr, 'Attack_free_dataset_64.txt') # print("\ncurrent at:{}".format(os.getcwd())) # print() """pkl to txt"""
def _describe_or_single_row(frame):
    """Return the only row of *frame*, or ``frame.describe()`` otherwise."""
    if len(frame) == 1:
        return frame.iloc[0]
    return frame.describe()


def main(routing_hdf_path, net_file_namebase):
    """Build a text summary of the routing results stored in an HDF file.

    Args:
        routing_hdf_path: Path of the HDF file (converted with ``str``).
        net_file_namebase: Name of the node under the file root whose
            ``route_states`` table is read; also used in the report title.

    Returns:
        Tuple ``(report, routing_results)`` where ``report`` is the summary
        text and ``routing_results`` is the DataFrame built from the
        ``route_states`` table.

    Side effects:
        The pandas display options ``float_format`` and ``column_space`` are
        changed while the report is built and restored afterwards.
    """
    option_names = ('display.float_format', 'display.column_space')
    format_opts = dict((k, pd.get_option(k)) for k in option_names)

    # Format floats to:
    #
    #  * Avoid small float values being displayed as zero _(e.g.,
    #    critical-path-delay)_.
    #  * Use engineering postfix to make it easier to compare values
    #    at-a-glance _(e.g., `u` for micro, `n` for nano, etc.)_.
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=True)
    try:
        h5f = ts.open_file(str(routing_hdf_path), 'r')
        try:
            # In our case, we need to first load the data from our
            # `route_states` table from the HDF file into a
            # `pandas.DataFrame` instance.
            net_file_routings = getattr(h5f.root, net_file_namebase)
            data = np.array([v.fetch_all_fields()
                             for v in net_file_routings.route_states],
                            dtype=net_file_routings.route_states.dtype)
        finally:
            # Close the file even when reading the table fails.
            h5f.close()
        routing_results = pd.DataFrame(data)

        # Every entry becomes one output line; collecting plain strings
        # avoids the Python 2-only ``print >>`` / ``StringIO`` module.
        lines = []
        emit = lines.append
        indent = 4 * ' '
        rule = '\n' + 70 * '-' + '\n'

        emit('# [%s] Routing results summary #\n' % net_file_namebase)

        # An empty table is described as well; it used to leave the summary
        # variable unbound.
        min_success_summary = _describe_or_single_row(
            min_success_data(routing_results))
        emit('## Minimum routable channel-width summary ##\n')
        emit(str(prefix_lines(min_success_summary, indent)))
        emit(rule)

        # The row count of the max-failed data itself decides the branch;
        # the previous code tested the min-success data here (copy-paste).
        max_failed_summary = _describe_or_single_row(
            max_failed_data(routing_results))
        emit('## Maximum unroutable channel-width summary ##\n')
        emit(str(prefix_lines(max_failed_summary, indent)))

        incomplete_routing_searches = np.where(
            min_success_max_failed_channel_width_diff(routing_results) != 1)
        if len(incomplete_routing_searches[0]):
            emit('Incomplete routings:')
            emit('\n'.join(
                [' * `%s`'
                 % pformat(routing_results['block_positions_sha1'][i])
                 for i in incomplete_routing_searches[0]]))
        # NOTE(review): the separator is assumed to follow the section
        # unconditionally - confirm against the original indentation.
        emit(rule)

        emit('## Missing routability result routing configurations'
             ' ##\n')
        emit('\n'.join(
            [' * `%s`' % pformat(v)
             for v in missing_routability_result_configs(routing_results)]))
        emit(rule + '\n')

        report = ''.join(line + '\n' for line in lines)
        return report, routing_results
    finally:
        # Restore unconditionally: the default float_format is None, and
        # skipping None values left the engineering format active forever.
        for k, v in format_opts.items():
            pd.set_option(k, v)
#!/usr/bin/env python
# coding: utf-8

# Notebook export: load the ASHRAE energy-prediction CSV files.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Show floats in engineering notation with four decimals.
pd.set_eng_float_format(accuracy=4)

# All input files live in the same local folder.
_DATA_DIR = r'E:\Users\quadr\Documents\datascience-arquivos\Kaggle\Ashrae_Energy_Prediction'

# Training data and the matching weather data.
train = pd.read_csv(_DATA_DIR + '\\train.csv')
w_train = pd.read_csv(_DATA_DIR + '\\weather_train.csv')

# Building metadata.
b_meta = pd.read_csv(_DATA_DIR + '\\building_metadata.csv')
def eng():
    """Configure pandas' global float display settings.

    After the call floats are displayed with thousands separators and five
    decimals, and ``display.precision`` is 7.

    Side effects:
        Changes global pandas display options.
    """
    import pandas as pd

    # NOTE(review): the engineering format set here is replaced by the
    # explicit float_format assignment below - confirm which was intended.
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=True)
    # '{:, .5f}' is not a valid format spec (the space may not follow the
    # comma) and raised ValueError as soon as a float was displayed.
    pd.options.display.float_format = '{:,.5f}'.format
    # Fully qualified key: the bare 'precision' pattern is ambiguous on
    # pandas versions that also define 'styler.format.precision'.
    pd.set_option('display.precision', 7)
def set_format():
    """Switch pandas' float display to engineering notation.

    Two decimals are shown and the exponent is written as ``E+03`` style
    text rather than as an SI prefix.
    """
    eng_settings = {'accuracy': 2, 'use_eng_prefix': False}
    pd.set_eng_float_format(**eng_settings)