def to_molecular(df: pd.DataFrame, renorm=True):
    """
    Convert mass quantities to molar quantities of the same order.

    Every column is divided by the molecular mass that ``pt.formula`` reports
    for the column label.

    Parameters
    -----------
    df : :class:`pandas.DataFrame`
        Dataframe to transform. Column labels are passed to ``pt.formula``.
    renorm : :class:`bool`, :code:`True`
        Whether to renormalise the dataframe after converting to relative moles.

    Returns
    -------
    :class:`pandas.DataFrame`
        Transformed dataframe.

    Notes
    ------
    Does not convert units (i.e. mass% --> mol%; mass-ppm --> mol-ppm).
    """
    molar_masses = [pt.formula(component).mass for component in df.columns]
    moles = df.div(molar_masses)
    return renormalise(moles) if renorm else moles
def test_arith_flex_series(self):
    """Check that flex arithmetic methods agree with the plain operators.

    Uses ``self.simple`` as the fixture frame; it must contain a row
    labelled ``'a'`` and a column labelled ``'two'``.
    """
    frame = self.simple

    row = frame.xs('a')
    col = frame['two']

    # after arithmetic refactor, add truediv here
    for name in ('add', 'sub', 'mul', 'mod'):
        flex_method = getattr(frame, name)
        operator_func = getattr(operator, name)
        # Row-wise broadcast (default axis) and column-wise broadcast (axis=0).
        assert_frame_equal(flex_method(row), operator_func(frame, row))
        assert_frame_equal(flex_method(col, axis=0),
                           operator_func(frame.T, col).T)

    # special case for some reason
    assert_frame_equal(frame.add(row, axis=None), frame + row)

    # cases which will be refactored after big arithmetic refactor
    assert_frame_equal(frame.div(row), frame / row)
    assert_frame_equal(frame.div(col, axis=0), (frame.T / col).T)

    # broadcasting issue in GH7325: the same expectation must hold for an
    # integer frame and for a float frame.
    for dtype in ('int64', 'float64'):
        numbers = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype=dtype)
        expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = numbers.div(numbers[0], axis='index')
        assert_frame_equal(result, expected)
def propNoteGraph(data_test, b_u, b_i, mu, L, R):
    """Compare per-rating hit rates of the factor model and the bias-only baseline.

    For every row ``r`` of ``data_test`` the code reads a user key
    (``data_test[r, 0]``), a movie key (``data_test[r, 1]``) and a value
    (``data_test[r, 2]``) that is added to the bias-only mean to rebuild the
    true rating. Two 5x2 count tables (ratings 1..5 by 'BON'/'MAUVAIS') are
    filled, row-normalised, and their 'BON' columns returned side by side.

    Parameters
    ----------
    data_test : 2-D array indexable as ``data_test[r, c]``
        Test samples; column 2 is presumably the residual w.r.t. the
        bias-only mean -- TODO confirm against the caller.
    b_u, b_i : indexable by user key / movie key
        Bias terms.
    mu : number
        Global mean.
    L, R : 2-D arrays
        Latent factors; row ``L[user, :]`` is dotted with ``R[movie, :]``.

    Returns
    -------
    pandas.DataFrame
        Index 1..5, columns ``['ALGO', 'NAIF']``. Rows with no counts are NaN
        (0/0).

    Raises
    ------
    KeyError
        If a rounded true rating falls outside 1..5.
    """
    index_note = np.arange(1, 6)
    labels = ['BON', 'MAUVAIS']
    notes = DataFrame(np.zeros([5, 2]), index=index_note, columns=labels)
    notes_naif = DataFrame(np.zeros([5, 2]), index=index_note, columns=labels)

    for r in range(data_test.shape[0]):
        user, movie = data_test[r, 0], data_test[r, 1]
        mean = mu + b_u[user] + b_i[movie]

        # Model prediction, clipped to the valid rating range 1..5.
        r_pred = round(mean + np.dot(L[user, :], R[movie, :]))
        r_pred = int(max(1, min(5, r_pred)))
        r_true = int(round(mean + data_test[r, 2]))
        r_naif = round(mean)

        # Use .loc for the increments: the former chained form
        # (``frame.COL[key] += 1``) writes to a temporary under pandas
        # Copy-on-Write and leaves the count tables untouched.
        if r_naif == r_true:
            notes_naif.loc[r_true, 'BON'] += 1
        else:
            notes_naif.loc[r_true, 'MAUVAIS'] += 1

        if r_pred == r_true:
            notes.loc[r_true, 'BON'] += 1
        else:
            # NOTE(review): misses are filed under the *predicted* rating
            # here but under the *true* rating for the naive table, so the
            # two columns of the result are not the same statistic
            # (precision-like vs recall-like). Kept as-is; confirm intent.
            notes.loc[r_pred, 'MAUVAIS'] += 1

    notes_naif_prop = notes_naif.div(notes_naif.sum(1), axis=0)
    notes_prop = notes.div(notes.sum(1), axis=0)

    notes_naif_VS_algo = pd.concat([notes_prop.BON, notes_naif_prop.BON], axis=1)
    notes_naif_VS_algo.columns = ['ALGO', 'NAIF']
    return notes_naif_VS_algo
def hmm_build(alphabet, aln, threshold, sigma):
    '''Build a profile HMM from a multiple alignment, with pseudocounts.

    alphabet  -- emitted symbols (columns of the emission matrix)
    aln       -- aligned sequences, '-' marking a gap
    threshold -- a column is a match column when its gap fraction is
                 strictly below this value
    sigma     -- pseudocount added to the permitted transitions/emissions

    Returns (transitions, emissions) as DataFrames indexed by state name,
    rows normalised and rounded to 3 decimals.
    '''
    columns = list(zip(*aln))
    n_seqs, n_cols = len(aln), len(columns)

    # Columns whose gap fraction is below the threshold become match states.
    match_cols = [c for c in range(n_cols)
                  if columns[c].count('-') / n_seqs < threshold]
    match_set = set(match_cols)

    # State order: S, I0, then (M, D, I) per match column, then E.
    k = len(match_cols)
    states = ['S', 'I0']
    for number in range(1, k + 1):
        states.extend('M{0} D{0} I{0}'.format(number).split())
    states.append('E')

    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states)

    def row_normalise(frame):
        # The small epsilon keeps all-zero rows from dividing by zero.
        return frame.div(frame.sum(1) + 1e-10, axis=0).round(3)

    # Count observed transitions and emissions along each sequence's path.
    for seq in aln:
        state_ix = 0
        previous = 'S'
        for pos in range(n_cols):
            symbol = seq[pos]
            in_match_col = pos in match_set
            if in_match_col:
                state_ix += 1
            if symbol == '-':
                if not in_match_col:
                    # A gap in an insert column visits no state.
                    continue
                current = 'D' + str(state_ix)
            else:
                current = ('M' if in_match_col else 'I') + str(state_ix)
                emissions.loc[current, symbol] += 1
            transitions.loc[previous, current] += 1
            previous = current
        transitions.loc[previous, 'E'] += 1

    # scale rows to [0, 1]
    transitions = row_normalise(transitions)
    emissions = row_normalise(emissions)

    # add pseudocounts on the blocks of allowed transitions / emitting states
    transitions.iloc[:2, 1:4] += sigma
    transitions.iloc[-4:-1, -2:] += sigma
    for block in range(k):
        first = block * 3
        transitions.iloc[first - 1:first + 2, first + 1:first + 4] += sigma
        emissions.iloc[first + 1:first + 3, :] += sigma
    emissions.iloc[-2, :] += sigma

    # scale again
    return row_normalise(transitions), row_normalise(emissions)
def hmm_build(alphabet, aln, threshold, sigma):
    '''Build a profile HMM (with pseudocounts) from a multiple alignment.

    alphabet  -- emitted symbols (columns of the emission matrix)
    aln       -- aligned sequences, '-' marking a gap
    threshold -- a column is a match column when its gap fraction is
                 strictly below this value
    sigma     -- pseudocount added to the permitted transitions/emissions

    Returns (transitions, emissions, states, k): the two probability
    DataFrames, the ordered list of state names and the number of match
    states. The final matrices are not rounded and carry a 1e-100 offset so
    that no entry is exactly zero.
    '''
    columns = list(zip(*aln))
    n_seqs, n_cols = len(aln), len(columns)

    # Columns whose gap fraction is below the threshold become match states.
    match_cols = [c for c in range(n_cols)
                  if columns[c].count('-') / n_seqs < threshold]
    match_set = set(match_cols)

    # State order: S, I0, then (M, D, I) per match column, then E.
    k = len(match_cols)
    states = ['S', 'I0']
    for number in range(1, k + 1):
        suffix = str(number)
        states += ['M' + suffix, 'D' + suffix, 'I' + suffix]
    states += ['E']

    transitions = DataFrame(data=0.0, index=states, columns=states)
    emissions = DataFrame(data=0.0, index=states, columns=alphabet)

    def row_scale(frame):
        # The small epsilon keeps all-zero rows from dividing by zero.
        return frame.div(frame.sum(1) + 1e-10, axis=0)

    # Count observed transitions and emissions along each sequence's path.
    for seq in aln:
        state_ix = 0
        previous = 'S'
        for pos in range(n_cols):
            symbol = seq[pos]
            in_match_col = pos in match_set
            if in_match_col:
                state_ix += 1
            if symbol == '-':
                if not in_match_col:
                    # A gap in an insert column visits no state.
                    continue
                current = 'D' + str(state_ix)
            else:
                current = ('M' if in_match_col else 'I') + str(state_ix)
                emissions.loc[current, symbol] += 1
            transitions.loc[previous, current] += 1
            previous = current
        transitions.loc[previous, 'E'] += 1

    # scale rows to [0, 1]
    transitions = row_scale(transitions).round(3)
    emissions = row_scale(emissions).round(3)

    # add pseudocounts on the blocks of allowed transitions / emitting states
    transitions.iloc[:2, 1:4] += sigma
    transitions.iloc[-4:-1, -2:] += sigma
    for block in range(k):
        first = block * 3
        transitions.iloc[first - 1:first + 2, first + 1:first + 4] += sigma
        emissions.iloc[first + 1:first + 3, :] += sigma
    emissions.iloc[-2, :] += sigma

    # scale again; the tiny offset avoids exact zeros in the result
    transitions = row_scale(transitions) + 1e-100
    emissions = row_scale(emissions) + 1e-100
    return transitions, emissions, states, k
def to_molecular(df: pd.DataFrame, renorm=True):
    """
    Convert mass quantities to molar quantities of the same order.

    Each column is divided by the molecular mass that ``pt.formula`` reports
    for its label; when ``renorm`` is true the result is passed through
    ``renormalise`` before being returned.

    E.g.:
    mass% --> mol%
    mass-ppm --> mol-ppm
    """
    molar_masses = [pt.formula(component).mass for component in df.columns]
    moles = df.div(molar_masses)
    if not renorm:
        return moles
    return renormalise(moles)
def word_vector(self, strx, stry):
    """Build a row-normalised co-occurrence matrix and standardise its SVD.

    ``self.data_oversam`` is loaded into a two-column frame (``strx``,
    ``stry``); each pair is counted via ``Count.casecount``, pivoted into a
    ``strx`` x ``stry`` matrix and row-normalised. The matrix is decomposed
    with ``SVD.SVD_run``; the singular values are plotted, and the U factor is
    standardised column-wise.

    Returns a tuple ``(standardised U frame, SVD.sf(singular values))``.
    """
    pairs = DataFrame(self.data_oversam, columns=[strx, stry])
    pairs[u'case'] = pairs[strx] + '_' + pairs[stry]
    case_totals = pairs[u'case'].value_counts()

    # Reset to a fresh positional index and drop the column it leaves behind.
    pairs = pairs.reset_index()
    del pairs[u'index']

    counter = Count()
    pairs[u'count'] = counter.casecount(pairs, case_totals)

    # One row per distinct, non-null pair.
    pairs = pairs[pairs[u'case'].notnull()].drop_duplicates()
    pairs = pairs.set_index([strx, stry])
    del pairs[u'case']

    # Pivot to a matrix; missing pairs count as 0.
    cooccurrence = pairs.unstack().fillna(0)
    cooccurrence.columns = cooccurrence.columns.get_level_values(1)
    # cooccurrence is now the co-occurrence frequency matrix
    cooccurrence = cooccurrence.div(cooccurrence.sum(1), axis=0)

    # Standardisation
    decomposer = SVD()
    Uframe, Vframe, Sframe = decomposer.SVD_run(cooccurrence)
    Sframe.plot()
    plt.plot(Sframe, 'o')

    column_mean = Uframe.mean(0)
    column_std = Uframe.std(0)
    standardised = Uframe.sub(column_mean, axis=1).div(column_std, axis=1)
    return standardised, decomposer.sf(Sframe)
def min_max_scale_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Scale every column by its own maximum so line shapes are easier to
    compare on plots.

    Only the division by the column maximum is performed (no minimum is
    subtracted), so a column reaches 1 at its maximum and stays within
    [0, 1] only when its values are non-negative.

    :param df: data frame to be scaled
    :return: scaled dataframe
    """
    column_maxima = df.max()
    return df.div(column_maxima, axis=1)
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Divide the dataframe by the fitted scaling factors.

    Parameters
    ----------
    X : Pandas DataFrame of shape = [n_samples, n_features]
        The data to be transformed.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame (presumably raised by
        ``_to_dataframe`` -- confirm against that helper).
    ValueError
        If the dataframe is not of the same size as that used in fit()
        (presumably raised by ``_ensure_ncols``).

    Returns
    -------
    X : pandas dataframe
        The dataframe with the transformed variables.
    """
    # The estimator must have been fitted before it can transform.
    check_is_fitted(self)

    # Validate the input and its column count against the fitted shape.
    X = _to_dataframe(X)
    _ensure_ncols(X, self.input_shape_[1])

    # NOTE(review): axis=0 aligns ``scaling_factors_`` on the row index;
    # confirm that is how the factors are indexed in fit().
    return X.div(self.scaling_factors_, axis=0)
def hmm_build(alphabet, aln, threshold):
    '''Build a profile HMM from a multiple alignment (no pseudocounts).

    alphabet  -- emitted symbols (columns of the emission matrix)
    aln       -- aligned sequences, '-' marking a gap
    threshold -- a column is a match column when its gap fraction is
                 strictly below this value

    Returns (transitions, emissions) as DataFrames indexed by state name,
    rows normalised and rounded to 3 decimals.
    '''
    columns = list(zip(*aln))
    n_seqs, n_cols = len(aln), len(columns)

    # Columns whose gap fraction is below the threshold become match states.
    match_cols = [c for c in range(n_cols)
                  if columns[c].count('-') / n_seqs < threshold]
    match_set = set(match_cols)

    # State order: S, I0, then (M, D, I) per match column, then E.
    k = len(match_cols)
    states = ['S', 'I0']
    for number in range(1, k + 1):
        suffix = str(number)
        states += ['M' + suffix, 'D' + suffix, 'I' + suffix]
    states += ['E']

    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states)

    # Count observed transitions and emissions along each sequence's path.
    for seq in aln:
        state_ix = 0
        previous = 'S'
        for pos in range(n_cols):
            symbol = seq[pos]
            in_match_col = pos in match_set
            if in_match_col:
                state_ix += 1
            if symbol == '-':
                if not in_match_col:
                    # A gap in an insert column visits no state.
                    continue
                current = 'D' + str(state_ix)
            else:
                current = ('M' if in_match_col else 'I') + str(state_ix)
                emissions.loc[current, symbol] += 1
            transitions.loc[previous, current] += 1
            previous = current
        transitions.loc[previous, 'E'] += 1

    # normalize rows; the epsilon keeps all-zero rows from dividing by zero
    transition_totals = transitions.sum(1) + 1e-10
    emission_totals = emissions.sum(1) + 1e-10
    transitions = transitions.div(transition_totals, axis=0).round(3)
    emissions = emissions.div(emission_totals, axis=0).round(3)
    return transitions, emissions
def scatter_pie_from_df(
    df: pd.DataFrame,
    x: str,
    y: str,
    cols: Optional[list] = None,
    normalize: bool = True,
    return_df: bool = False,
    palette: Optional[dict] = None,
    cmap: Optional[str] = "tab10",
    **kwargs,
) -> Axes:
    """
    Plot scatter pie based on columns in a DataFrame.

    Parameters:
        df: Dataframe containing x, y, and additional count columns.
        x: Column to use as x-values.
        y: Column to use as y-values.
        cols: List of columns in dataframe to use as ratios and plotting.
            If None or [], uses all columns besides x and y.
        normalize: If True, row-normalize the selected columns into ratios
            (rows summing to zero become all-zero). If False, the selected
            columns are passed on unchanged.
        return_df: If True, also return the (normalized) dataframe.
        palette: Dictionary mapping column name to color.
            If None, create mapping using cmap.
        cmap: Name of colormap to use if palette not provided.
        kwargs: Arguments passed to :func:`scatter_pie`

    Returns:
        A :class:`~matplotlib.axes.Axes` and normalized df if `return_df` is True.

    Raises:
        ValueError: If `cols` is given but is not a list of at least two
            column headers.
    """
    # make copy of dataframe and set xy as index
    df = df.copy().set_index([x, y])

    # None is the immutable stand-in for the former mutable default ``[]``;
    # an explicit empty list is still accepted and means the same thing.
    if cols is None:
        cols = []
    if isinstance(cols, list) and len(cols) > 1:
        # use specified list of columns
        df = df.loc[:, cols]
    elif cols != []:
        raise ValueError("cols must be a list of more than one column headers")

    categories = df.columns
    if normalize:
        # row normalize; 0/0 rows (no counts at all) become zeros
        df = df.div(df.sum(axis=1), axis=0).fillna(0)
    df = df.reset_index()

    # generate mapping of category to color
    if palette is None:
        palette = get_palette(categories, cmap)

    ratios = df[categories].to_records(index=False).tolist()
    colors = [palette[cat] for cat in categories]
    ax = scatter_pie(df[x].values, df[y].values, ratios, colors, **kwargs)

    if return_df:
        return ax, df
    return ax
def jevons_index(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    axis: int = 1,
) -> pd.Series:
    """Return the Jevons index: 100 times the geometric mean of the price
    relatives (``prices`` divided by ``base_prices``) taken along ``axis``.
    """
    relatives = prices.div(base_prices)
    geometric_mean = geo_mean(relatives, axis)
    return geometric_mean * 100
def get_quality_adjusted_prices(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    adjustments: pd.DataFrame,
    axis: pd._typing.Axis = 1,
) -> pd.DataFrame:
    """Scale the base prices by the cumulative quality adjustment factor.

    The per-period factor is ``prices / (prices - adjustments)``; factors are
    accumulated with a cumulative product along ``axis`` and multiplied onto
    ``base_prices``.
    """
    unadjusted = prices - adjustments
    factor = prices.div(unadjusted)
    cumulative_factor = factor.cumprod(axis)
    return base_prices * cumulative_factor
def carli_index(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    axis: int = 1,
) -> pd.Series:
    """Return the Carli index: 100 times the arithmetic mean of the price
    relatives (``prices`` divided by ``base_prices``) taken along ``axis``.
    """
    relatives = prices.div(base_prices)
    mean_relative = relatives.mean(axis)
    return mean_relative * 100
def laspeyres_index(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    weights: pd.DataFrame,
    axis: int = 1,
) -> pd.Series:
    """Return the Laspeyres index: 100 times the ``aggregate`` of the price
    relatives (``prices`` divided by ``base_prices``) with ``weights`` along
    ``axis``.
    """
    relatives = prices.div(base_prices)
    weighted = aggregate(relatives, weights, axis=axis)
    return weighted * 100
def plot_lines(df: pd.DataFrame, normalize=False):
    """Show an interactive line chart with one trace per column of ``df``.

    When ``normalize`` is true every column is first divided by its value in
    the first row, so each line starts at 1.
    """
    if normalize:
        first_row = df.iloc[0]
        df = df.div(first_row, axis=1)

    fig = go.Figure()
    for name in df.columns:
        trace = go.Scatter(x=df.index, y=df[name], name=name)
        fig.add_trace(trace)

    fig.update_layout(
        showlegend=True,
        xaxis={'hoverformat': '%d%b%Y'},
        yaxis={'hoverformat': '.1%'},
    )
    fig.show()
def _clean_up(df: pd.DataFrame) -> pd.DataFrame:
    """Format the data.

    The frame is transposed and stacked into one long series, re-indexed with
    consecutive month-end dates starting in January of the first year found in
    the original index, divided by 100, and returned as a one-column frame
    named ``col.CPI``.
    """
    df = df.transpose().stack()
    # After stacking, the first level of the first index entry is the year.
    first_year = df.index[0][0]
    df.index = pd.date_range(
        name=col.DATE,
        # An explicit MonthEnd offset replaces the "M" alias, which pandas
        # 2.2 deprecated in favour of "ME"; the offset object works on both
        # sides of that change.
        freq=pd.offsets.MonthEnd(),
        start=pd.Timestamp(year=first_year, month=1, day=END_OF_JAN),
        periods=len(df),
    )
    df = df.div(100)
    return df.to_frame(col.CPI)
def geometric_laspeyres_index(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    weights: pd.DataFrame,
    axis: int = 1,
) -> pd.Series:
    """Return the geometric Laspeyres index: 100 times the ``aggregate`` of
    the price relatives (``prices`` divided by ``base_prices``) with
    ``weights``, using the ``'geomean'`` method along ``axis``.
    """
    relatives = prices.div(base_prices)
    return aggregate(relatives, weights, method='geomean', axis=axis) * 100
def get_rolling_beta(df: pd.DataFrame, hist: pd.DataFrame, mark: pd.DataFrame, n: int) -> pd.DataFrame:
    """Turns a holdings portfolio into a rolling beta dataframe

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe of daily holdings; the ``"Holding"`` block is used.
    hist : pd.DataFrame
        A dataframe of historical returns; the ``"Close"`` block is used.
    mark : pd.DataFrame
        The dataframe of market performance; the ``"Market"`` block is used.
    n : int
        The period to get returns for

    Returns
    ----------
    final : pd.DataFrame
        Dataframe with rolling beta: per-holding ``prod_*`` columns (weight
        times beta), their row-wise ``total``, and the ``beta_*`` columns,
        restricted to the last ``n + 1`` days.
    """
    df = df["Holding"]
    uniques = df.columns.tolist()

    # Portfolio weights: each holding as a share of the daily total.
    res = df.div(df.sum(axis=1), axis=0)
    res = res.fillna(0)

    comb = pd.merge(hist["Close"], mark["Market"], how="outer", left_index=True, right_index=True)
    # ``.ffill()`` replaces the deprecated ``fillna(method="ffill")``.
    comb = comb.ffill()

    # Rolling 252-row OLS of each asset on the merged "Close" column
    # (presumably the market close contributed by ``mark`` -- confirm).
    for col in hist["Close"].columns:
        exog = sm.add_constant(comb["Close"])
        rols = RollingOLS(comb[col], exog, window=252)
        rres = rols.fit()
        res[f"beta_{col}"] = rres.params["Close"]

    final = res.ffill()
    for uni in uniques:
        final[f"prod_{uni}"] = final[uni] * final[f"beta_{uni}"]

    # Keep the betas aside, then reduce ``final`` to the weighted products.
    dropped = final[[f"beta_{x}" for x in uniques]].copy()
    final = final.drop(columns=[f"beta_{x}" for x in uniques] + uniques)
    final["total"] = final.sum(axis=1)
    final = final[final.index >= datetime.now() - timedelta(days=n + 1)]

    comb = pd.merge(final, dropped, how="left", left_index=True, right_index=True)
    return comb
def _get_proportional_weights(signal_df: pd.DataFrame, values_df: pd.DataFrame, inversely: bool) -> pd.DataFrame:
    """Return row-wise weights proportional (or inversely proportional) to
    ``values_df`` for the entries selected by ``signal_df``.

    Assumes signal_df and values_df are two DataFrames with the same index
    and columns. ``inversely`` decides if the weights are proportional
    (False) or inversely proportional (True) to the values. Rows whose
    weighted values sum to zero yield all-zero weights.

    The caller's ``values_df`` is left untouched; all work is done on a copy.

    Raises
    ------
    ValueError
        If the two frames do not share index and column names.
    """
    if not dataframe_has_same_index_and_column_names(signal_df, values_df):
        raise ValueError(
            'signal_df and values_df does not have the same composition.')

    # Work on a copy: the in-place edits below used to leak into the frame
    # passed by the caller.
    values_df = values_df.copy()

    # NOTE(review): the first column is blanked out unconditionally, so it
    # always ends up with zero weight -- confirm this is intentional.
    values_df.iloc[:, 0] = np.nan

    if inversely:
        values_df = values_df.apply(lambda x: 1.0 / x)

    # Keep only the entries switched on by the signal.
    values_df *= signal_df
    values_sum_s = values_df.sum(axis=1)
    proportional_weight_df = values_df.div(values_sum_s, axis='index').fillna(value=0)
    return proportional_weight_df
def generate_probability_vector_result(output_path):
    """Write row-normalised point-to-cluster distances to ``probability.csv``.

    Reads ``clusters.csv`` (first column is used as the row index, the
    remaining columns as coordinates) and ``points.csv`` (all columns are
    coordinates) from ``output_path``, computes the Euclidean distance from
    every point to every cluster, divides each point's distances by their sum
    and writes the result to ``output_path + '/probability.csv'``.

    NOTE(review): the normalised value grows with distance, so the closest
    cluster gets the smallest share -- confirm this is the intended meaning
    of "probability".
    """
    cluster_frame = pd.read_csv(output_path + '/clusters.csv', header=None)
    # ``.ix`` was removed in pandas 1.0. With header=None the column labels
    # are 0..n-1, so positional ``.iloc`` selects exactly the same columns.
    cluster_frame = cluster_frame.set_index(cluster_frame.iloc[:, 0]).iloc[:, 1:]
    cluster_array = cluster_frame.values

    points_frame = pd.read_csv(output_path + '/points.csv', header=None)
    points_array = points_frame.values

    # Transpose so that rows are points and columns are clusters.
    distance_matrix = pw.euclidean_distances(cluster_array, points_array)
    distance_matrix = distance_matrix.T
    distance_frame = DataFrame(distance_matrix)

    distance_frame = distance_frame.div(distance_frame.sum(axis=1), axis=0)
    distance_frame.to_csv(output_path + '/probability.csv')
def generate_probability_vector_result(output_path):
    """Write row-normalised point-to-cluster distances to ``probability.csv``.

    Reads ``clusters.csv`` (first column is used as the row index, the
    remaining columns as coordinates) and ``points.csv`` (all columns are
    coordinates) from ``output_path``, computes the Euclidean distance from
    every point to every cluster, divides each point's distances by their sum
    and writes the result to ``output_path + '/probability.csv'``.

    NOTE(review): the normalised value grows with distance, so the closest
    cluster gets the smallest share -- confirm this is the intended meaning
    of "probability".
    """
    cluster_frame = pd.read_csv(output_path + '/clusters.csv', header=None)
    # ``.ix`` was removed in pandas 1.0. With header=None the column labels
    # are 0..n-1, so positional ``.iloc`` selects exactly the same columns.
    first_column = cluster_frame.iloc[:, 0]
    cluster_frame = cluster_frame.set_index(first_column).iloc[:, 1:]
    cluster_array = cluster_frame.values

    points_frame = pd.read_csv(output_path + '/points.csv', header=None)
    points_array = points_frame.values

    # Transpose so that rows are points and columns are clusters.
    distance_matrix = pw.euclidean_distances(cluster_array, points_array).T
    distance_frame = DataFrame(distance_matrix)

    distance_frame = distance_frame.div(distance_frame.sum(axis=1), axis=0)
    distance_frame.to_csv(output_path + '/probability.csv')
def winter_monthly(df: pd.DataFrame) -> pd.DataFrame:
    """Compute winter monthly deaths as a %age of all winter deaths.

    Rows dated 1 Jul 2020 to 30 Jun 2021 (by ``Date``) are summed per
    calendar month and each column is expressed as a percentage of its own
    total. Two empty rows (31 May 2021 and 30 Jun 2021) are appended for the
    ``UK`` column.

    Raises
    ------
    AssertionError
        If the ``UK`` total for the period is not 95234 (data quality check;
        note that asserts are skipped when Python runs with ``-O``).
    """
    df = df.query("Date >= '1 Jul 2020' and Date <= '30 Jun 2021'")
    # An explicit MonthEnd offset replaces the "M" alias, which pandas 2.2
    # deprecated in favour of "ME"; the offset object works on both sides of
    # that change.
    df = df.resample(pd.offsets.MonthEnd()).sum()
    assert df["UK"].sum() == 95234  # quality check

    # convert to monthly percentage of total
    df = df.div(df.sum()) * 100

    # data is to mid April 2021: pad remaining months to end of winter period with None
    idx = pd.to_datetime(
        [datetime(2021, 5, 31, 0, 0, 0), datetime(2021, 6, 30, 0, 0, 0)]
    )
    null_data = pd.DataFrame(columns=["UK"], data=[None, None], index=idx)
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([df, null_data])
def feature_statistics_per_class(features, targets, target_names, bins=5):
    """Return, per class, the share of samples falling in each feature bin.

    Every feature is divided by its column maximum, multiplied by ``bins`` and
    truncated to an integer bin label. The labels are one-hot encoded with a
    ``DictVectorizer``, counted per class, and each class column is divided by
    the number of samples of that class.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature table, one row per sample.
    targets : array-like
        Class label of each sample.
    target_names : sequence
        Column names for the classes, in the column order produced by
        ``LabelBinarizer``.
    bins : int, default 5
        Scale factor applied before truncation.

    Returns
    -------
    pandas.DataFrame
        Rows are ``feature=bin`` indicator names, columns are ``target_names``.
    """
    from pandas import DataFrame
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.feature_extraction import DictVectorizer
    import numpy as np

    binned_df = (features.div(features.max()) * bins).astype(int).astype(str)
    feature_dict = binned_df.to_dict(orient='records')
    dv = DictVectorizer()
    x = dv.fit_transform(feature_dict)
    # NOTE(review): for exactly two classes LabelBinarizer is documented to
    # return a single column, which would not match a two-entry
    # ``target_names`` -- confirm callers only use 3+ classes.
    y = LabelBinarizer().fit_transform(targets)

    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older releases.
    if hasattr(dv, 'get_feature_names_out'):
        feature_names = dv.get_feature_names_out()
    else:
        feature_names = dv.get_feature_names()

    # ``toarray`` yields a plain ndarray rather than the np.matrix that
    # ``todense`` returns; the product holds the same counts.
    counts = np.dot(x.T.toarray(), y)
    feature_count_df = DataFrame(counts, columns=target_names,
                                 index=feature_names)
    feature_count_norm_df = feature_count_df.div(
        DataFrame(y, columns=target_names).sum())
    return feature_count_norm_df
def get_quality_adjustments(
    quality_value: pd.DataFrame,
    to_reset: Optional[pd.DataFrame] = None,
    to_adjust: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Return cumulative quality adjustment factors for given values.

    The per-period factor is each quality value divided by the value one
    column to its left. Factors are accumulated by
    ``get_cumulative_adjustments`` (described in the original documentation
    as accumulating across each Feb-Jan+1 window). Where ``to_reset`` is
    true, the factor is replaced by the inverse of the cumulative factor so
    that the accumulation returns to 1 (no adjustment).

    Parameters
    ----------
    quality_value : DataFrame
        The quality value used to calculate quality adjustments.
    to_reset : DataFrame
        Boolean mask of quality adjustments to be reset.
    to_adjust : DataFrame
        Boolean mask of values to be adjusted; everything outside the mask
        gets a factor of 1.

    Returns
    -------
    DataFrame
        Cumulative adjustment factors for base prices.
    """
    # Divide size by the period before.
    previous_period = quality_value.shift(1, axis=1)
    adjustment_factors = quality_value.div(previous_period)

    if to_adjust is not None:
        adjustment_factors[~to_adjust] = 1

    if to_reset is not None:
        # Get the inverse cumulative growth for resetting.
        cumulative = get_cumulative_adjustments(adjustment_factors)
        adjustment_factors = adjustment_factors.mask(to_reset, cumulative.pow(-1))

    # Fill data lost in first period with 1 i.e. no adjustment.
    final_factors = get_cumulative_adjustments(adjustment_factors)
    return final_factors.fillna(1)
def _axis_wise(
    df: pd.DataFrame,
    level: int,
    totals_name: str,
    subtotals_name: str,
    ndigits: int,
    unit: int,
    **kwargs
) -> pd.DataFrame:
    """Express ``df`` as a share of its (sub)total rows, scaled by ``unit``.

    For ``level`` 0 the row labelled ``totals_name`` is the denominator; for
    deeper levels the rows labelled ``subtotals_name`` are used. With a
    MultiIndex, the total rows are spread back over the full index and
    back-filled so every row is divided by the next total row below it.
    The result is passed through ``round_percentages``.
    """
    label = subtotals_name if level > 0 else totals_name

    if isinstance(df.index, pd.MultiIndex):
        total_rows = df.xs(label, level=level, drop_level=False)
        totals = total_rows.reindex(df.index).bfill()
    else:
        totals = df.loc[label]

    shares = df.div(totals).multiply(unit)
    return round_percentages(shares, ndigits=ndigits)
def _table_wise(
    df: pd.DataFrame,
    level: int,
    subtotals_name: str,
    ndigits: int,
    unit: int,
    **kwargs
) -> pd.DataFrame:
    """Express ``df`` as a share of the table (sub)total, scaled by ``unit``.

    For ``level`` 0 the denominator is the bottom-right cell of ``df``. For
    deeper levels the cells at the intersection of the ``subtotals_name``
    rows and columns are used, back-filled over the whole table. The result
    is passed through ``round_percentages``.
    """
    if level != 0:
        subtotal_cells = (
            df.xs(subtotals_name, level=level, drop_level=False)
            .xs(subtotals_name, axis=1, level=level, drop_level=False)
        )
        totals = subtotal_cells.reindex_like(df).bfill().bfill(axis=1)
    else:
        totals = df.iloc[-1, -1]
        has_multiindex = df.index.nlevels > 1 or df.columns.nlevels > 1
        if has_multiindex:
            # Broadcast the grand total over an empty frame shaped like df.
            frame = pd.DataFrame().reindex_like(df)
            frame.iloc[-1, -1] = totals
            totals = frame.bfill().bfill(axis=1)

    shares = df.div(totals).multiply(unit)
    return round_percentages(shares, ndigits=ndigits)
def _table_wise_multilevel(
    df: pd.DataFrame,
    axlevels: Any,
    totals_name: str,
    subtotals_name: str,
    ndigits: int,
    unit: int,
    **kwargs
) -> pd.DataFrame:
    """Express ``df`` as a share of per-axis (sub)totals, scaled by ``unit``.

    ``axlevels`` holds one collection of levels per axis (rows first, then
    columns); the smallest level of each is used. An axis whose level is 0
    uses ``totals_name`` as its total label, otherwise ``subtotals_name``.
    The cells at the intersection of those total rows and columns are
    back-filled over the table and used as denominators. The result is
    passed through ``round_percentages``.
    """
    row_level, col_level = [min(level) for level in axlevels][:2]
    row_label = totals_name if row_level == 0 else subtotals_name
    col_label = totals_name if col_level == 0 else subtotals_name

    total_cells = (
        df.xs(row_label, level=row_level, drop_level=False)
        .xs(col_label, axis=1, level=col_level, drop_level=False)
    )
    totals = total_cells.reindex_like(df).bfill().bfill(axis=1)

    shares = df.div(totals).multiply(unit)
    return round_percentages(shares, ndigits=ndigits)
def dataFrameMathTest():
    """Print the result of a series of DataFrame math methods.

    Loads ``mlg.csv``, keeps columns 1..6 (by position) and prints, in order:
    abs, count, cummax, cummin, cumsum, diff, div(12), max, mean, median,
    min, mul(2), sum and where(> 0.5).

    Note: the methods that return a series default to working on columns.
    """
    # Load a DataFrame from a CSV file
    source = pd.read_csv('mlg.csv')
    data = source.iloc[:, 1:7]

    # Each entry is evaluated and printed before the next one runs.
    operations = [
        lambda d: d.abs(),        # absolute values
        lambda d: d.count(),      # non NA/null values
        lambda d: d.cummax(),     # (cols default axis)
        lambda d: d.cummin(),     # (cols default axis)
        lambda d: d.cumsum(),     # (cols default axis)
        lambda d: d.diff(),       # 1st diff (col def axis)
        lambda d: d.div(12),      # div by df, Series, value
        lambda d: d.max(),        # max of axis (col def)
        lambda d: d.mean(),       # mean (col default axis)
        lambda d: d.median(),     # median (col default)
        lambda d: d.min(),        # min of axis (col def)
        lambda d: d.mul(2),       # mul by df Series val
        lambda d: d.sum(),        # sum axis (cols default)
        lambda d: d.where(d > 0.5, other=np.nan),
    ]
    for operation in operations:
        print(operation(data))
def compute_percentages(data_: pd.DataFrame):
    """Divide every column by the last column, row by row.

    Infinite and missing results (for example from a zero in the last column)
    are replaced with 0. The input frame is not modified.
    """
    denominator = data_.iloc[:, -1]
    ratios = data_.div(denominator, axis=0)
    return ratios.replace([np.inf, -np.inf, np.nan], 0)
matrices of transition and emission probabilities. ''' from pandas import DataFrame from io import StringIO f = open('rosalind_ba10h.txt').read().rstrip().split('--------\n') x = list(f[0].rstrip()) alphabet = f[1].rstrip().split() path = list(f[2].rstrip()) states = f[3].rstrip().split() transitions = DataFrame(data=0.0, index=states, columns=states) emissions = DataFrame(data=0.0, index=states, columns=alphabet) for t in zip(path[:-1], path[1:]): transitions.loc[t] += 1 for a in zip(path, x): emissions.loc[a] += 1 transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3) emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3) f = StringIO() transitions.to_csv(f, sep='\t', float_format='%g') f.write('--------\n') emissions.to_csv(f, sep='\t', float_format='%g') open('rosalind_ba10h_sub.txt', 'wt').write(f.getvalue().rstrip())
def initial_setup(
    self,
    iot_p: pd.DataFrame,
    dtilde_iot: pd.DataFrame,
    ytilde_iot: pd.DataFrame,
    p_tau: float,
    substitution_rate: float,
) -> None:
    """
    One-time setup of the GDP model

    Stores the input-output tables on the instance, derives the primary-input
    table ``xtilde_iot``, the input shares ``gamma_d`` / ``gamma_x``, the tax
    table ``o_iot``, total output ``q_iot`` and the scaling ``Lambda``, checks
    their consistency with assertions, builds the objective vector and finally
    registers the production-function, input and output constraints.

    :param iot_p: primary input-output data
    :param dtilde_iot: intermediate input-output data
    :param ytilde_iot: final input-output data
    :param p_tau: tax rate on products and production (fraction of pre-crisis levels)
    :param substitution_rate: Rate at which capital can be substituted for labour and vice versa
    :raises AssertionError: if the tables do not balance within tolerance, if
        the shares do not sum to 1 or are negative, or if the objective length
        does not match ``self.variables``
    """
    self.iot_p = iot_p
    self.dtilde_iot = dtilde_iot
    self.ytilde_iot = ytilde_iot
    # Primary inputs, one row each: imports, compensation, and the sum of
    # fixed capital consumption and net operating surplus. Rows are then
    # relabelled with M (assumes M enumerates them in that order -- TODO
    # confirm against the definition of M).
    self.xtilde_iot = pd.concat(
        [
            iot_p[PrimaryInput.IMPORTS],
            iot_p[PrimaryInput.COMPENSATION],
            iot_p[[
                PrimaryInput.FIXED_CAPITAL_CONSUMPTION,
                PrimaryInput.NET_OPERATING_SURPLUS,
            ]].sum(axis=1),
        ],
        axis=1,
    ).T
    self.xtilde_iot.index = M
    # x~[M.K, T] == 0, so we add a small epsilon
    self.xtilde_iot = np.maximum(self.xtilde_iot, 1e-6)
    self.ytilde_total_iot = self.ytilde_iot.sum(axis=1)

    # Shares of each intermediate / primary input in the column total of all
    # inputs; the same denominator is used for both.
    self.gamma_d = dtilde_iot.div(
        dtilde_iot.sum(axis=0) + self.xtilde_iot.sum(axis=0))
    self.gamma_x = self.xtilde_iot.div(
        dtilde_iot.sum(axis=0) + self.xtilde_iot.sum(axis=0))

    # Taxes on production and on products, transposed to one row per tax.
    self.o_iot = iot_p[[
        PrimaryInput.TAXES_PRODUCTION, PrimaryInput.TAXES_PRODUCTS
    ]].T
    # Total output: intermediate inputs plus all primary inputs.
    self.q_iot = dtilde_iot.sum(axis=0) + iot_p.sum(axis=1)
    # Balance check: total inputs equal total uses (intermediate + final).
    assert np.allclose(
        (dtilde_iot.sum(axis=0) + iot_p.sum(axis=1)),
        (dtilde_iot.sum(axis=1) + self.ytilde_total_iot),
        rtol=1e-6,
    )  # errors are due to rounding and omission of household sector
    # Same balance, rebuilt from the derived xtilde and tax tables.
    assert np.allclose(
        (dtilde_iot.sum(axis=0) + self.xtilde_iot.sum(axis=0) +
         self.o_iot.sum(axis=0)),
        (dtilde_iot.sum(axis=1) + self.ytilde_total_iot),
        rtol=1e-6,
    )  # errors are due to rounding and omission of household sector

    # Shares must sum to one per column and be non-negative.
    assert np.allclose(self.gamma_d.sum(axis=0) + self.gamma_x.sum(axis=0),
                       1.0,
                       atol=1e-9)
    assert (self.gamma_d >= 0).all().all()
    assert (self.gamma_x >= 0).all().all()

    # depends on p_tau
    # Cobb-Douglas form; computed here but not used further in this method.
    cd_prod_fun = dtilde_iot.pow(self.gamma_d).prod(
        axis=0) * self.xtilde_iot.pow(self.gamma_x).prod(axis=0)
    # Minimum over inputs of input / share.
    min_prod_fun = pd.concat(
        [
            dtilde_iot.multiply(1 / self.gamma_d).min(),
            self.xtilde_iot.multiply(1 / self.gamma_x).min(),
        ],
        axis=1,
    ).min(axis=1)
    # Share-weighted sum of inputs.
    sum_prod_fun = (dtilde_iot.multiply(self.gamma_d).sum() +
                    self.xtilde_iot.multiply(self.gamma_x).sum())
    # Blend of the two forms above, weighted by substitution_rate.
    lin_prod_fun = (1 - substitution_rate
                    ) * min_prod_fun + substitution_rate * sum_prod_fun
    prod_fun = lin_prod_fun
    self.Lambda = (1 / (1 - (self.o_iot.sum(axis=0) / self.q_iot) * p_tau) *
                   (dtilde_iot.sum(axis=0) + self.xtilde_iot.sum(axis=0)) /
                   prod_fun)

    # Flatten the frames into dicts keyed by (input, sector) / sector.
    self.gamma_d_dict = {(i, j): self.gamma_d.loc[i, j]
                         for i in Sector for j in Sector}
    self.gamma_x_dict = {(m, j): self.gamma_x.loc[m, j]
                         for m in M for j in Sector}
    self.Lambda_dict = {i: self.Lambda[i] for i in Sector}

    # Production-tax share of output per sector, scaled by p_tau.
    weight_taxes = {
        i: p_tau * self.o_iot.loc[PrimaryInput.TAXES_PRODUCTION, i] /
        self.q_iot[i]
        for i in Sector
    }
    # Per-sector terms built from self.indicator(...) for the M.L and M.K
    # inputs plus tax-weighted output (presumably coefficient vectors over
    # self.variables -- see the shape assertion below).
    self.gdp_per_sector = {
        i: self.indicator("xtilde", M.L, i) +
        self.indicator("xtilde", M.K, i) +
        self.indicator("q", i) * weight_taxes[i]
        for i in Sector
    }
    self.surplus_per_sector = {
        i: self.indicator("xtilde", M.K, i)
        for i in Sector
        # households don't have capital input to production
    }
    # Negated sum of both sets of terms (negation presumably because the
    # solver minimises -- TODO confirm).
    self.objective_c = -np.sum(list(
        self.gdp_per_sector.values()), axis=0) - np.sum(
            list(self.surplus_per_sector.values()), axis=0)
    assert self.objective_c.shape[0] == len(self.variables)
    # Reference values taken directly from the tables.
    self.max_gdp_per_sector = (
        self.xtilde_iot.loc[M.L] + self.xtilde_iot.loc[M.K] +
        self.o_iot.loc[PrimaryInput.TAXES_PRODUCTION])
    self.max_gdp = self.max_gdp_per_sector.sum()

    # Register the constraints with the derived coefficients.
    self.c_production_function_lin(self.gamma_d_dict, self.gamma_x_dict,
                                   self.Lambda_dict, substitution_rate)
    self.c_input(self.o_iot, self.q_iot, p_tau)
    self.c_output(self.q_iot)
def add_shares(table: pd.DataFrame):
    """Append to the table each investor's share of the portfolio.

    Every row is divided by its ``"Portfolio"`` value; the resulting rows are
    labelled ``"%"`` and stacked below the original rows.
    """
    portfolio_value = table["Portfolio"]
    shares = table.div(portfolio_value, axis="index")
    shares.index = ["%" for _ in range(len(shares))]
    return pd.concat([table, shares])
columns=list('abc')) frame2 = DataFrame(np.arange(1,10).reshape(3,3), columns=list('abc')) print(frame1) print(frame2) # frame 덧셈 add = frame1.add(frame2) print(add) # frame 뺄셈 sub = frame2.sub(frame1) print(sub) # frame 나눗셈 div = frame2 / frame1 div = frame2.div(frame1) print(div) # inf : 부모가 0인 경우 # frame 곱셈 mul = frame1.mul(frame2) print(mul) # 행/열 단위 합계/평균/최댓값/최솟값 sum1 = mul.sum(axis = 1) # 행 단위 sum2 = mul.sum(axis = 0) # 열 단위 print('행 단위 합계:\n',sum1) print('열 단위 합계:\n',sum2) avg1 = mul.mean(axis = 1) # 행 단위 평균