def inner_join(df, join: pd.DataFrame, prefix: str = '', prefix_left='', force_multi_index=False, ffill=False):
    if df is None:
        if force_multi_index:
            if isinstance(join.columns, pd.MultiIndex):
                return join
            else:
                return add_multi_index(join, prefix)
        else:
            return join

    if force_multi_index:
        if not isinstance(df.columns, pd.MultiIndex) and len(df.columns) > 0:
            if len(prefix_left) <= 0:
                raise ValueError("You need to provide a prefix_left")
            else:
                df = add_multi_index(df, prefix_left)

    if isinstance(df.columns, pd.MultiIndex) and not isinstance(join.columns, pd.MultiIndex):
        b = join.copy()
        b.columns = pd.MultiIndex.from_product([[prefix], b.columns])

        if ffill:
            return pd \
                .merge(df, b, left_index=True, right_index=True, how='outer', sort=True) \
                .ffill() \
                .dropna()
        else:
            return pd.merge(df, b, left_index=True, right_index=True, how='inner', sort=True)
    else:
        if ffill:
            return pd \
                .merge(df.add_prefix(prefix_left), join.add_prefix(prefix),
                       left_index=True, right_index=True, how='outer', sort=True) \
                .ffill() \
                .dropna()
        else:
            return pd.merge(df.add_prefix(prefix_left), join.add_prefix(prefix),
                            left_index=True, right_index=True, how='inner', sort=True)
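# Minimal usage sketch for the inner_join helper above (hypothetical frames, not from the
# original source): two frames sharing a DatetimeIndex are joined with per-source prefixes.
import pandas as pd

idx = pd.date_range("2021-01-01", periods=5, freq="D")
signals = pd.DataFrame({"score": range(5)}, index=idx)
quotes = pd.DataFrame({"close": [10.0, 11.0, 12.0, 13.0, 14.0]}, index=idx)
joined = inner_join(signals, quotes, prefix="quote_", prefix_left="signal_")
print(joined.columns.tolist())  # ['signal_score', 'quote_close']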
def merge_and_displace_frames(
    substrate: pd.DataFrame,
    reference: pd.DataFrame,
    pipette: pd.DataFrame,
    experiment_duration: pd.Timedelta,
    duration_of_resampled_row: pd.Timedelta,
):
    reference = reference.add_prefix("Reference_")
    substrate = substrate.add_prefix("Substrate_")
    pipette = pipette.add_prefix("Pipette_")

    # Convert from frame numbers to the actual time through the experiment
    for df, name in [
        (reference, "Reference"),
        (substrate, "Substrate"),
        (pipette, "Pipette"),
    ]:
        df: pd.DataFrame
        name: str
        number_of_frames = df[f"{name}_Frame"].max()
        instant = df[f"{name}_Frame"] / number_of_frames * experiment_duration  # pd.Timedelta -> seconds
        df["Instant"] = instant
        df.set_index("Instant", inplace=True)

    combined = pd.concat(
        (reference, substrate, pipette),
        axis="columns",  # We want to join the tables so that the columns are the joining point (i.e. left and right)
    )

    # In order to compare results between experiments, we must now resample them
    # (so that each row has a common `Instant`).
    # This is a lossy operation. We choose to take the mean.
    combined: pd.DataFrame = combined.resample(rule=duration_of_resampled_row).mean()

    # Frame numbers are no longer valid
    combined.drop(
        columns=[col for col in combined.columns if col.endswith("Frame")],
        inplace=True,
    )
    logger.info(
        f"Resampled to buckets of {duration_of_resampled_row} ({len(combined)} rows)"
    )

    combined["X_Delta"] = (combined["Substrate_X_Position"]
                           - combined["Reference_X_Position"])
    combined["Y_Delta"] = (combined["Substrate_Y_Position"]
                           - combined["Reference_Y_Position"])

    # Make our delta lines start at 0
    x_start = combined["X_Delta"].iloc[0]
    y_start = combined["Y_Delta"].iloc[0]
    combined["X_Delta"] = combined["X_Delta"] - x_start
    combined["Y_Delta"] = combined["Y_Delta"] - y_start

    return combined
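# Hedged usage sketch for merge_and_displace_frames (synthetic tracking data; the column
# names `Frame`, `X_Position`, `Y_Position` and a module-level `logger` are assumptions).
import logging
import pandas as pd

logger = logging.getLogger(__name__)

frames = pd.DataFrame({
    "Frame": range(1, 11),
    "X_Position": [float(i) for i in range(10)],
    "Y_Position": [2.0 * i for i in range(10)],
})
combined = merge_and_displace_frames(
    substrate=frames.copy(),
    reference=frames.copy(),
    pipette=frames.copy(),
    experiment_duration=pd.Timedelta(seconds=10),
    duration_of_resampled_row=pd.Timedelta(seconds=1),
)
print(combined[["X_Delta", "Y_Delta"]].head())  # deltas start at 0 by construction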
def slide_17():
    df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
    print(pd.get_dummies(df['key']))
    dummies = pd.get_dummies(df['key'], prefix='key')
    print(dummies)
    df_with_dummy = df[['data1']].join(dummies)
    print(df_with_dummy)

    mnames = ['movie_id', 'title', 'genres']
    movies = pd.read_table(MOVIELENSPATH, sep='::', header=None,
                           engine='python', names=mnames)
    print(movies[:10])
    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))
    print(genres)
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1
    movies_windic = movies.join(dummies.add_prefix('Genre_'))
    print(movies_windic.iloc[0])

    values = np.random.rand(10)
    print(values)
    bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
    print(pd.get_dummies(pd.cut(values, bins)))
def compute_anomalous_events(df_prices: pd.DataFrame, df_bollinger: pd.DataFrame):
    """Compute anomalous (high or low) price events for a set of stocks over time."""
    df = pd.concat([df_prices, df_bollinger.add_prefix("bol_")], axis=1)
    df["event"] = pd.Series(pd.NA, index=df.index, dtype=EVENT_TYPE)
    df.loc[df["close"] > df["bol_upper"], "event"] = "high"
    df.loc[df["close"] < df["bol_lower"], "event"] = "low"
    return df[df["event"].notna()][["name", "date", "event"]].reset_index()
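# Hedged usage sketch for compute_anomalous_events (synthetic prices and bands; EVENT_TYPE
# is assumed to be a pandas CategoricalDtype like the one defined below).
import pandas as pd

EVENT_TYPE = pd.CategoricalDtype(categories=["high", "low"])
prices = pd.DataFrame({
    "name": ["AAA"] * 3,
    "date": pd.date_range("2021-01-01", periods=3),
    "close": [10.0, 25.0, 1.0],
})
bollinger = pd.DataFrame({"upper": [20.0] * 3, "lower": [5.0] * 3})
print(compute_anomalous_events(prices, bollinger))  # one "high" and one "low" event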
def predict(self, X: pd.DataFrame) -> pd.Series:
    self._check_is_fitted()
    if set(self.features_) != set(X.columns):
        raise ValueError(
            f"Feature sets do not match: [{self.features_}, {X.columns}]"
        )
    data = X.add_prefix(__class__._X_COL_PREFIX)
    preds = map(self._predict_row, data.iterrows())
    return pd.Series(preds, index=data.index)
def test_add_prefix():
    npr = np.array([[20.2, 2.0, 3.2, 4.3, 5.5],
                    [10, -20, -30, -40, -50],
                    [36.2, 13.2, 16.4, 12.2, 10.8]])
    pdf = DataFrame(npr)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)
    prefix = "item_"
    cn_tb_with_prefix = cn_tb.add_prefix(prefix)
    pdf_with_prefix = pdf.add_prefix(prefix)
    assert pdf_with_prefix.columns.tolist() == cn_tb_with_prefix.column_names
def combine_sewershed_polygon_sample(self, df: pd.DataFrame,
                                     polygon: pd.DataFrame) -> pd.DataFrame:
    if polygon.empty:
        return df
    elif df.empty:
        return polygon
    polygon = polygon.copy()
    polygon = polygon.add_prefix("Sewershed-")
    return pd.merge(df, polygon,
                    how="left",
                    left_on="Site_polygonID",
                    right_on="Sewershed-Polygon_polygonID")
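# Hedged sketch of the merge the method above performs (hypothetical sample/polygon frames):
# prefixing the polygon columns lets "Sewershed-Polygon_polygonID" be matched against the
# sample's "Site_polygonID" without column-name collisions.
import pandas as pd

samples = pd.DataFrame({"Site_polygonID": ["p1", "p2"], "value": [0.1, 0.2]})
polygons = pd.DataFrame({"Polygon_polygonID": ["p1", "p2"], "name": ["A", "B"]})
merged = pd.merge(samples, polygons.add_prefix("Sewershed-"),
                  how="left",
                  left_on="Site_polygonID",
                  right_on="Sewershed-Polygon_polygonID")
print(merged.columns.tolist())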
def combine_cphd_polygon_sample(self, df: pd.DataFrame,
                                polygon: pd.DataFrame) -> pd.DataFrame:
    if polygon.empty:
        return df
    elif df.empty:
        return polygon
    polygon = polygon.copy()
    polygon = polygon.add_prefix("CPHD-")
    return pd.merge(df, polygon,
                    how="left",
                    left_on="Calculated_polygonIDForCPHD",
                    right_on="CPHD-Polygon_polygonID")
def dummy02():
    mnames = ['movies_id', 'title', 'genres']
    movies = pd.read_table(r'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch02\movielens\movies.dat',
                           sep='::', header=None, engine='python', names=mnames)
    print(movies[:10])
    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))
    print(genres)
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1
    movies_windic = movies.join(dummies.add_prefix('Genre_'))
    print(movies_windic.iloc[0])
def test(path=None):
    data = pd.read_csv('ch08/Haiti.csv')
    data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20)
                & (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
                & data.CATEGORY.notnull()]
    # Get all categories
    all_cats = get_all_categories(data.CATEGORY)
    # Build a dict mapping category codes to category names
    english_mapping = dict(get_english(x) for x in all_cats)
    # Get all category codes
    all_codes = get_code(all_cats)
    # Build an Index object from the category codes
    code_index = pd.Index(np.unique(all_codes))
    # The index comes from the original data; the columns are the category codes
    dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                            index=data.index, columns=code_index)
    # row: index, cat: category
    # Set 1 for every category present in a record
    for row, cat in zip(data.index, data.CATEGORY):
        codes = get_code(to_cat_list(cat))
        dummy_frame.loc[row, codes] = 1
    data = data.join(dummy_frame.add_prefix('category_'))

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
    to_plot = ['2a', '1', '3c', '7a']
    lllat = 17.25
    urlat = 20.25
    lllon = -75
    urlon = -71
    for code, ax in zip(to_plot, axes.flat):
        m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,
                            lllon=lllon, urlon=urlon)
        cat_data = data[data['category_%s' % code] == 1]
        x, y = m(cat_data.LONGITUDE, cat_data.LATITUDE)
        m.plot(x, y, 'k.', alpha=0.5)
        ax.set_title('%s: %s' % (code, english_mapping[code]))
def Preprocess(movies, ratings):
    print('Data preprocessing...')
    # Convert the "genres" column into dummy indicators
    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))  # get all genres
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
    # Iterate over each row of "genres" and assign 1 to the corresponding locations
    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1
    movies_dummies = movies.join(dummies.add_prefix('Genres_'))  # merged with movies
    data_merged = pd.merge(movies_dummies, ratings, on='movieid')  # merge the two tables by 'movieid'
    data = data_merged.drop_duplicates()  # drop duplicates
    data = data.dropna()  # drop NA
    print('Preprocessing completed.')
    return data
def inner_join(df, join: pd.DataFrame, prefix: str = ''):
    if isinstance(df.columns, pd.MultiIndex) and not isinstance(join.columns, pd.MultiIndex):
        b = join.copy()
        b.columns = pd.MultiIndex.from_product([[prefix], b.columns])
        return pd.merge(df, b, left_index=True, right_index=True, how='inner', sort=True)
    else:
        return pd.merge(df, join.add_prefix(prefix),
                        left_index=True, right_index=True, how='inner', sort=True)
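# Minimal usage sketch for this simpler inner_join (hypothetical data): when the left frame
# already has MultiIndex columns, the right frame is nested under `prefix` as a new top level.
import pandas as pd

idx = pd.date_range("2021-01-01", periods=3, freq="D")
left = pd.DataFrame([[1.0], [2.0], [3.0]], index=idx,
                    columns=pd.MultiIndex.from_product([["prices"], ["close"]]))
right = pd.DataFrame({"volume": [100, 200, 300]}, index=idx)
joined = inner_join(left, right, prefix="stats")
print(joined.columns.tolist())  # [('prices', 'close'), ('stats', 'volume')]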
def movie_genres(movies):
    # Build dummy variables for the genres
    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
    # Iterate over movies.genres and set the matching genre positions to 1
    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1
    # Join the dummies back onto movies
    movies_windic = movies.join(dummies.add_prefix('Genres_'))
    movies_windic = movies_windic.drop('Genres_IMAX', axis=1)
    # Count the movies of each genre and draw a pie chart
    colors = [
        'Blue', 'RoyalBlue', 'MediumBlue', 'DodgerBlue', 'CornflowerBlue',
        'DeepSkyBlue', 'SkyBlue', 'Azure', 'SlateBlue', 'LightBlue',
        'PaleTurquoise', 'DarkCyan', 'DarkSlateBlue', 'LightSkyBlue',
        'MediumTurquoise', 'Navy', 'SteelBlue', 'MidnightBlue', 'PowderBlue'
    ]
    movies_windic.iloc[:, 5:].sum().plot(kind='pie', title='The Pie of Genres',
                                         colors=colors)
    return movies_windic
def portfolio_analyzer(weights: dict, pnl: pd.DataFrame, returns: pd.DataFrame,
                       factor_betas: pd.DataFrame, factor_alphas: pd.DataFrame,
                       n_days_delay: int):
    assert isinstance(weights, dict), f"{weights} must be a dictionary of pandas Series"

    w_daily = {}
    all_factors = {}
    factors_returns = {}
    factor_exp = {}
    factor_returns_models = {}
    S_idiosyncratic_returns = pd.Series(dtype=float, name='idiosyncratic_returns')

    dates_lst = list(weights.keys())
    for i, key_str_dt in enumerate(dates_lst[:-n_days_delay]):
        key_dt = dt.datetime.strptime(key_str_dt, "%Y%m%d")
        key_tzaw_rets_dt = pd.Timestamp(dates_lst[i + n_days_delay]).tz_localize(tz="utc")

        # weights
        val_w = weights[key_str_dt]
        w_daily[key_dt] = val_w

        # factors
        B_alpha = factor_alphas.loc[key_dt].add_prefix('alpha_')
        B_beta = factor_betas.add_prefix('beta_')
        B_all_factors = B_alpha.join(B_beta)  # static betas
        all_factors[key_dt] = B_all_factors

        # returns
        returns_day = returns.loc[key_tzaw_rets_dt]
        returns_day.name = 'returns'
        returns_day.index.name = 'asset'

        # Compute factor returns f(t)[i]: r(t+n)[i] = b(t)[i]*f(t)[i] + s(t)[i] as regression coefficients
        exog = B_all_factors.filter(regex='(^alpha|^beta)').fillna(0.0)
        endog = returns_day[exog.index].fillna(0.0)
        model = sm.OLS(endog, exog)
        res = model.fit()
        factor_returns_models[key_dt] = res
        factors_returns[key_dt] = res.params

        # Compute factor exposure e(t)[i] = w(t)[i]*f(t)[i]
        factor_exp[key_dt] = get_factor_exposures(factor_betas=exog, weights=val_w)

        # Compute idiosyncratic returns: s(t)[i]*w(t)[i]
        S_idiosyncratic_returns[key_dt] = partial_dot_product(v=res.resid, w=val_w)

    w_opt_df = pd.concat(w_daily)
    w_opt_df.index.names = ['date', 'asset']
    w_opt_df.name = 'w_opt'

    B_factors_asset_df = pd.concat(all_factors)
    B_factors_asset_df.index.names = ['date', 'asset']

    f_factors_returns_df = pd.concat(factors_returns).unstack()
    f_factors_returns_df.index.name = 'date'

    E_factors_exp_df = pd.concat(factor_exp).unstack()
    E_factors_exp_df.index.name = 'date'

    S_idiosyncratic_returns.index.name = 'date'

    pnl_and_w_df = join_weights_and_pnl(w_opt_df, pnl, returns)

    return (w_opt_df, pnl_and_w_df, B_factors_asset_df, f_factors_returns_df,
            E_factors_exp_df, S_idiosyncratic_returns)
# Read data from standard input on the command line
sys.stdin = os.fdopen(sys.stdin.fileno(), "r")
data = pd.read_csv(sys.stdin)

# Restrict to data in Haiti with categories
data = data[
    (data.LATITUDE > 18) & (data.LATITUDE < 20)
    & (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
    & data.CATEGORY.notnull()
]

# Extract categorizations
all_cats = get_all_categories(data.CATEGORY)

# Add indicator columns for categories
all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                        index=data.index, columns=code_index)
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1
data = data.join(dummy_frame.add_prefix("category_"))

# Write data to standard output
data.to_csv(sys.stdout)
# Next, select records by category
# Add indicator columns
# First, extract the unique category codes
def get_code(seq):
    return [x.split('.')[0] for x in seq if x]

all_codes = get_code(all_cats)
# Build an index from the codes
code_index = pd.Index(np.unique(all_codes))
# Construct a new DataFrame
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                        index=data.index, columns=code_index)

for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1
data = data.join(dummy_frame.add_prefix('category_'))

# Plot the Haiti map (the Basemap library could not be imported)
def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75, urlon=-71):
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2, lat_0=(urlat + lllat) / 2,
                llcrnrlat=lllat, urcrnrlat=urlat,
                llcrnrlon=lllon, urcrnrlon=urlon,
                resolution='f')
    m.drawcoastlines()
    m.drawstates()
    m.drawcountries()
    return m

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
to_plot = ['2a', '1', '3c', '7a']
lllat = 17.25
urlat = 20.25
lllon = -75
urlon = -71
for code, ax in zip(to_plot, axes.flat):
    m = basic_haiti_map(ax, lllat=lllat, urlat=urlat, lllon=lllon, urlon=urlon)
    cat_data = data[data['category_%s' % code] == 1]
    x, y = m(cat_data.LONGITUDE, cat_data.LATITUDE)
all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                        index=data.index, columns=code_index)

# <codecell>

dummy_frame.iloc[:, :6]

# <codecell>

for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1

data = data.join(dummy_frame.add_prefix('category_'))

# <codecell>

data.CATEGORY.isnull().value_counts()

# <codecell>

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75, urlon=-71):
    # create polar stereographic Basemap instance.
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2,
def _compose_data(cls, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
    data = X.add_prefix(cls._X_COL_PREFIX)  # makes a copy
    data[cls._Y_COL_PREFIX] = y
    return data
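# Hedged usage sketch for _compose_data: a minimal stand-in class supplying the assumed
# prefix attributes (the names below are hypothetical, not from the original source).
import pandas as pd

class _Demo:
    _X_COL_PREFIX = "x_"
    _Y_COL_PREFIX = "y"

composed = _compose_data(_Demo, pd.DataFrame({"a": [1, 2], "b": [3, 4]}), pd.Series([0, 1]))
print(composed.columns.tolist())  # ['x_a', 'x_b', 'y']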
def _get_next_state(
    state: pd.DataFrame,
    seperation: float,
    cohesion: float,
    alignment: float,
    visibility: float,
    dimensions: tp.List[str],
    step: float,
) -> pd.DataFrame:
    # Self-cross-product Boids for all (center, neighbor) pairs.
    state["i"] = range(len(state))
    state["j"] = 0
    pairs = pd.merge(
        left=state,
        right=state.add_prefix(prefix="n"),
        left_on="j",
        right_on="nj",
        how="outer",
    )

    # Unpack columns.
    cols = [
        (
            f"p{i}",   # Positions.
            f"v{i}",   # Velocities.
            f"np{i}",  # Neighbor positions.
            f"nv{i}",  # Neighbor velocities.
            f"nd{i}",  # Neighbor distances.
        )
        for i in dimensions
    ]
    p, v, np, nv, nd = map(list, zip(*cols))

    # For each dimension:
    for pi, npi, ndi in zip(p, np, nd):
        # Compute neighbor-to-center translations.
        pairs[ndi] = pairs[pi] - pairs[npi]

    # Compute neighbor-to-center distances.
    ndmag = pairs[nd].pow(2).sum(axis=1).pow(0.5)

    # Subset pairs to visible neighbors.
    pairs = pairs.loc[ndmag.le(visibility)]

    # For each dimension:
    for ndi in nd:
        # Transform neighbor-to-center translations to repulsions.
        pairs[ndi] /= ndmag.pow(2)

    # Compute neighbor velocity magnitudes.
    nvmag = pairs[nv].pow(2).sum(axis=1).pow(0.5)

    # For each dimension:
    for nvi in nv:
        # Transform neighbor velocities to (unit) neighbor directions.
        pairs[nvi] /= nvmag
        pairs[nvi].where(cond=nvmag.gt(0), other=0, inplace=True)

    # Nullify neighbors that are centers.
    pairs.loc[pairs["i"] == pairs["ni"], [*np, *nv, *nd]] = None

    # Augment repulsor behaviour.
    centers = pairs["t"].eq("repulsor")
    pairs.loc[centers, np] = None
    pairs.loc[centers, nv] = None
    pairs.loc[centers, nd] = None
    neighbors = pairs["nt"].eq("repulsor")
    pairs.loc[neighbors, np] = None
    pairs.loc[neighbors, nv] = None
    pairs.loc[neighbors, nd] *= 30

    # Aggregate neighbor information per center Boid.
    agg_last = {col: "last" for col in ("t", *p, *v)}
    agg_mean = {col: "mean" for col in (*np, *nv, *nd)}
    agg = {**agg_last, **agg_mean}
    groups = pairs.groupby(by="i", as_index=False, sort=False)
    state = groups.agg(func=agg).drop(columns="i")

    # For each dimension:
    for pi, npi in zip(p, np):
        # Transform mean-neighbor positions to center-to-mean-neighbor translations.
        state[npi] -= state[pi]

    # For each dimension:
    for pi, vi, npi, nvi, ndi in zip(p, v, np, nv, nd):
        # Compute accelerations.
        ai = 0
        ai += seperation * state.pop(ndi).where(cond=pd.notnull, other=0)
        ai += cohesion * state.pop(npi).where(cond=pd.notnull, other=0)
        ai += alignment * state.pop(nvi).where(cond=pd.notnull, other=0)
        # Update velocities and positions.
        state[vi] += ai * step**2
        state[pi] += state[vi] * step

    return state
class DimRed:
    """Class for quadruple dimension reduction."""

    def __init__(self, x, w, p):
        """
        Dimension reduction class.

        Parameters:
            x: Input matrix (np array)
            w: Weights to adjustment for ae
            p: latent dimension for ae
        """
        self.x = x
        self.w = w
        self.p = p
        self.pca = None
        self.nmf = None
        self.ae = None
        self.__reduced = None
        self.__pcanmf = None
        self.__median = None
        self.__ael1 = None
        self.__ael2 = None
        self.__a = None
        self.__r = 15
        self.__a1 = 0.03
        self.__a2 = 0.85
        self.__scorer = metrics.explained_variance_score
        self.__run_id = name_generator(6)

    def __str__(self):
        return "Quadruple dimension reduction class"

    def __repr__(self):
        return "\n" + self.__str__()

    def __get_score(self, model, y):
        """Determine level of explained variance."""
        prediction = model.inverse_transform(model.transform(y))
        return self.__scorer(y, prediction)

    def l_med(self):
        self.__median = pd.DataFrame(np.median(self.x.T, axis=1))
        self.__median = self.__median.add_prefix('MEDIAN_' + self.__run_id + '_')
        return

    def lde(self):
        """Decompose with PCA and NMF."""
        self.pca = PCA(n_components=0.95)
        self.pca.fit(self.x)
        pc_weights = pd.DataFrame(self.pca.components_.T)
        pc_weights = pc_weights.add_prefix('PCA_' + self.__run_id + '_')
        opti_rank = []
        # warnings.filterwarnings('ignore')
        for k in range(2, self.__r):
            nmf = NMF(n_components=k, max_iter=1000).fit(self.x)
            score_it = self.__get_score(nmf, self.x)
            opti_rank.append(score_it)
            if score_it >= 0.95:
                break
        self.nmf = NMF(n_components=len(opti_rank) + 1, max_iter=10000)
        self.nmf.fit(self.x)
        warnings.resetwarnings()
        nmf_weights = pd.DataFrame(self.nmf.components_.T)
        nmf_weights = nmf_weights.add_prefix('NMF_' + self.__run_id + '_')
        self.__pcanmf = pd.concat([pc_weights, nmf_weights], axis=1)
        return

    def __de4ae(self, y):
        """Estimate optimal dimension for AE, based on Bahadur and Paffenroth 2020, IEEE."""
        s_x = y.copy()
        for t in range(s_x.shape[0]):
            s_x.iloc[t, :] = np.sort(np.array(s_x.iloc[t, :]))[::-1]
        svp = np.sort(s_x.mean())[::-1]
        svp_sum = svp.sum()
        alg1 = sum(svp / svp_sum > self.__a1)
        alg2 = 0
        temp = (svp_sum * self.__a2) / 1
        temp2 = 0
        for i in range(len(svp)):
            temp2 += svp[i]
            alg2 += 1
            if temp2 >= temp:
                break
        return int((alg1 + alg2) / 2)

    def __aer(self, nc):
        """Build model structure."""
        input_layer = Input(shape=(nc,), name="input")
        encoder = Dense(self.__a, activation="relu",
                        kernel_initializer="glorot_uniform",
                        activity_regularizer=regularizers.l1_l2(1e-16, 1e-9),
                        name="enco1")(input_layer)
        encoder = Dense(self.__a // 2, activation="relu", name="code")(encoder)
        decoder = Dense(self.__a, activation="sigmoid", name="deco1")(encoder)
        decoder = Dense(nc, activation="sigmoid", name="output")(decoder)
        self.ae = Model(inputs=input_layer, outputs=decoder)
        self.ae.compile(optimizer=optimizers.RMSprop(learning_rate=1e-3),
                        loss='mean_squared_error', metrics=['mse'])
        return

    def mud_ae(self):
        """Model training and output."""
        nam = self.__run_id + "_model.h5"
        if self.p == 0:
            self.__a = self.x.shape[0] * 50 // 100
            check1 = ModelCheckpoint(filepath="est_" + nam, verbose=0,
                                     save_best_only=True)
            self.__aer(nc=self.x.shape[1])
            self.ae.fit(self.x, self.x,
                        sample_weight=self.w,
                        epochs=500,
                        shuffle=True,
                        batch_size=15,
                        validation_data=(self.x, self.x),
                        callbacks=[check1],
                        verbose=0)
            estimate = load_model("est_" + nam)
            code_est = Dff(estimate.get_layer("enco1").get_weights()[0])
            self.__a = self.__de4ae(code_est)
            print("The optimal number of dimensions is {}".format(self.__a))
        else:
            self.__a = self.p
        check = ModelCheckpoint(filepath=nam, verbose=0, save_best_only=True)
        self.__aer(nc=self.x.shape[1])
        self.ae.fit(self.x, self.x,
                    sample_weight=self.w,
                    epochs=3000,
                    shuffle=True,
                    batch_size=15,
                    validation_data=(self.x, self.x),
                    callbacks=[check],
                    verbose=0)
        final = load_model(nam)
        self.__ael1 = Dff(final.get_layer("enco1").get_weights()[0])
        self.__ael1 = self.__ael1.add_prefix('AE_' + self.__run_id + '_')
        self.__ael2 = Dff(final.get_layer("code").get_weights()[0])
        self.__ael2["run"] = nam
        self.__ael2.to_csv("code_{}.csv".format(self.__run_id), index=False)
        return

    def fit(self):
        """Fit quadruple dimension reduction {Median, PCA, NMF, AE[DE]}."""
        self.l_med()
        self.lde()
        self.mud_ae()
        self.__reduced = pd.concat([self.__ael1, self.__pcanmf, self.__median],
                                   axis=1)
        return

    def get_reduced(self):
        """Get reduced dimension."""
        return self.__reduced

    def get_aede(self):
        return self.__a

    def add_reduced_row(self, y):
        self.__reduced["ID"] = y
        self.__pcanmf["ID"] = y
        self.__ael1["ID"] = y
        return
print(pd.get_dummies(df['key']))

# Add a prefix to the DataFrame columns to make merging easier
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
print(df_with_dummy)

# A row can belong to several categories at once
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames)
print(movies[:10])

# Data wrangling
genre_iter = (set(x.split('|')) for x in movies.genres)
# Extract the distinct values
genres = sorted(set.union(*genre_iter))
print(genre_iter)
print(genres)

# Build an all-zero DataFrame
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1
movies_windic = movies.join(dummies.add_prefix('Genre_'))
print(movies_windic.iloc[0])

# Combine get_dummies with a discretization function such as cut
values = np.random.rand(10)
print(values)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
print(pd.get_dummies(pd.cut(values, bins)))
def red_flags(test_file: pd.DataFrame):
    # Difference between claim date and policy effect/emission dates
    test_file = test_file[[
        'id_siniestro', 'fecha_diferencia_siniestro_efecto',
        'fecha_diferencia_siniestro_efecto_5',
        'fecha_diferencia_siniestro_efecto_15',
        'fecha_diferencia_siniestro_efecto_30',
        'fecha_diferencia_siniestro_emision',
        'fecha_diferencia_siniestro_emision_5',
        'fecha_diferencia_siniestro_emision_15',
        'fecha_diferencia_siniestro_emision_30',
        'fecha_siniestro_ocurrencia', 'fecha_poliza_emision',
        'fecha_poliza_efecto_natural',
        'fecha_diferencia_siniestro_comunicacion'
    ]]

    policy_file = pd.read_csv(STRING.poliza_input_prediction, sep=',',
                              encoding='utf-8', quotechar='"')
    policy_file = policy_file[[
        'audit_siniestro_referencia', 'poliza_cod_intermediario'
    ]]
    policy_file = policy_file.rename(
        columns={
            'audit_siniestro_referencia': 'id_siniestro',
            'poliza_cod_intermediario': 'id_mediador'
        })
    policy_file['id_siniestro'] = policy_file['id_siniestro'].map(int)
    test_file['id_siniestro'] = test_file['id_siniestro'].map(int)
    test_file = pd.merge(test_file, policy_file, how='left', on='id_siniestro')
    test_file = test_file.dropna(subset=['id_siniestro'])
    test_file['id_mediador'] = test_file['id_mediador'].fillna(-1)

    # Occurrence between the effect and emission dates
    for i in [
            'fecha_siniestro_ocurrencia', 'fecha_poliza_emision',
            'fecha_poliza_efecto_natural'
    ]:
        test_file[i] = pd.to_datetime(test_file[i], format='%Y-%m-%d',
                                      errors='coerce')

    test_file['fecha_ocurrencia_entre_efecto_emision'] = pd.Series(
        0, index=test_file.index)
    test_file.loc[(test_file['fecha_poliza_emision'] <=
                   test_file['fecha_siniestro_ocurrencia']) &
                  (test_file['fecha_siniestro_ocurrencia'] <=
                   test_file['fecha_poliza_efecto_natural']),
                  'fecha_ocurrencia_entre_efecto_emision'] = 1
    test_file.loc[(test_file['fecha_poliza_efecto_natural'] <=
                   test_file['fecha_siniestro_ocurrencia']) &
                  (test_file['fecha_siniestro_ocurrencia'] <=
                   test_file['fecha_poliza_emision']),
                  'fecha_ocurrencia_entre_efecto_emision'] = 1

    # Difference between communication and occurrence
    test_file['retraso_comunicacion'] = pd.Series(0, index=test_file.index)
    test_file.loc[test_file['fecha_diferencia_siniestro_comunicacion'] >= 15,
                  'retraso_comunicacion'] = 1

    # Intermediary (mediador) flag
    test_file['mediador'] = pd.Series(0, index=test_file.index)
    test_file['id_mediador'] = test_file['id_mediador'].map(int)
    test_file.loc[test_file['id_mediador'] == 62659, 'mediador'] = 1

    # Red-flag indicator
    test_file['indicator'] = pd.Series(0, index=test_file.index)
    # test_file.loc[test_file['fecha_diferencia_siniestro_efecto'] <= 30, 'indicator'] = 1
    test_file.loc[test_file['fecha_diferencia_siniestro_emision'] <= 30,
                  'indicator'] = 1
    test_file.loc[test_file['retraso_comunicacion'] == 1, 'indicator'] = 1
    test_file.loc[test_file['fecha_ocurrencia_entre_efecto_emision'] == 1,
                  'indicator'] = 1
    test_file.loc[test_file['mediador'] == 1, 'indicator'] = 1

    test_file = test_file.add_prefix('RF_')
    test_file = test_file.rename(columns={'RF_id_siniestro': 'id_siniestro'})
    return test_file
english_mapping = dict(get_english(x) for x in all_cats)

def get_code(seq):
    return [x.split('.')[0] for x in seq if x]

all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                        index=data.index, columns=code_index)

for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1

# Join the indicator columns back onto the original table
data = data.join(dummy_frame.add_prefix('category_'))

def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75, urlon=-71):
    # Create a polar stereographic Basemap instance
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2, lat_0=(urlat + lllat) / 2,
                llcrnrlat=lllat, urcrnrlat=urlat,
                llcrnrlon=lllon, urcrnrlon=urlon,
                resolution='f')
    # Draw coastlines, state and country borders, and the map boundary
    m.drawcoastlines()
def slide_14():
    data = pd.read_csv(HAICHICSVPATH)
    print(data)
    print(data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10])
    print('Categories in the data')
    print(data['CATEGORY'][:6])
    print('Summary of the data')
    print(data.describe())
    print('Remove outliers and missing values')
    data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20)
                & (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
                & (data.CATEGORY.notnull())]

    def to_cat_list(catstr):
        stripped = (x.strip() for x in catstr.split(','))
        return [x for x in stripped if x]

    def get_all_categories(cat_series):
        cat_sets = (set(to_cat_list(x)) for x in cat_series)
        return sorted(set.union(*cat_sets))

    def get_english(cat):
        code, names = cat.split('.')
        if '|' in names:
            names = names.split(' | ')[1]
        return code, names.strip()

    all_cats = get_all_categories(data.CATEGORY)
    english_mapping = dict(get_english(x) for x in all_cats)
    print(english_mapping['2a'])
    print(english_mapping['6c'])

    def get_code(seq):
        return [x.split('.')[0] for x in seq if x]

    all_codes = get_code(all_cats)
    code_index = pd.Index(np.unique(all_codes))
    dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                            index=data.index, columns=code_index)
    print(dummy_frame.iloc[:, :6])
    print(data.index)
    for row, cat in zip(data.index, data.CATEGORY):
        codes = get_code(to_cat_list(cat))
        dummy_frame.loc[row, codes] = 1
    data = data.join(dummy_frame.add_prefix('category_'))
    print(data.iloc[:, 10:15])

    from mpl_toolkits.basemap import Basemap

    def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75, urlon=-71):
        m = Basemap(ax=ax, projection='stere',
                    lon_0=(urlon + lllon) / 2, lat_0=(urlat + lllat) / 2,
                    llcrnrlat=lllat, urcrnrlat=urlat,
                    llcrnrlon=lllon, urcrnrlon=urlon,
                    resolution='f')
        m.drawcoastlines()
        m.drawstates()
        m.drawcountries()
        return m

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
    to_plot = ['2a', '1', '3c', '7a']
    lllat = 17.25
    urlat = 20.25
    lllon = -75
    urlon = -71
    for code, ax in zip(to_plot, axes.flat):
        m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,
                            lllon=lllon, urlon=urlon)
        cat_data = data[data['category_%s' % code] == 1]
        x, y = m(cat_data.LONGITUDE.values, cat_data.LATITUDE.values)
        m.plot(x, y, 'k.', alpha=0.5)
        ax.set_title('%s: %s' % (code, english_mapping[code]))
    m.readshapefile(SHAPEFILEPATH, 'roads')
import pandas as pd
import numpy as np
from pandas import DataFrame

### Computing indicator and dummy variables
# Convert a categorical variable into dummy-variable form
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
pd.get_dummies(df['key'])
dummies = pd.get_dummies(df['key'], prefix='key')  # prefix sets the name prefix for the dummy columns
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

# The case where a row belongs to more than one category
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('d:/data/movies.dat', sep='::', header=None, names=mnames)
movies[:10]
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]
mnames = ['movie_id', 'tittle', 'genres']
movies = pd.read_table(
    '/Users/changyueh/Desktop/CodePractice/Data_Analysis/Chapt2/ml-1m/movies.dat',
    sep='::', header=None, names=mnames)
movies[:10]

genre_iter = (set(x.split('|')) for x in movies.genres)  # iterate over each genre string
genres = sorted(set.union(*genre_iter))  # list all genres, de-duplicated
dummies = DataFrame(np.zeros((len(movies.index), len(genres))),
                    columns=genres)  # construct an all-zero DataFrame with one column per genre
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1  # explained on page 216
dummies.head()

movies_windic = movies.join(dummies.add_prefix('Genre_'))  # join directly, adding a prefix to the new columns
movies_windic.iloc[0]

values = np.random.rand(10)
values
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
ranks = ['0<x<=0.2', '0.2<x<=0.4', '0.4<x<=0.6', '0.6<x<=0.8', '0.8<x<=1.']
pd.get_dummies(pd.cut(values, bins, labels=ranks))

# String operations
## String object methods; Table 7-3 on page 218 lists all the built-in string methods
val = 'a,b, guido'
val.split(',')  # the built-in split can do a lot
pieces = [x.strip() for x in val.split(',')]
pieces  # strip removes the whitespace automatically
return [x.split(".")[0] for x in seq if x] # Read data from standard input on the command line sys.stdin = os.fdopen(sys.stdin.fileno(), "rU") data = pd.read_csv(sys.stdin) # Restrict to data in Haiti with categories data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) & (data.LONGITUDE > -75) & (data.LONGITUDE < -70) & data.CATEGORY.notnull()] # Extract categorizations all_cats = get_all_categories(data.CATEGORY) # Add indicator columns for categories all_codes = get_code(all_cats) code_index = pd.Index(np.unique(all_codes)) dummy_frame = DataFrame(np.zeros((len(data), len(code_index))), index=data.index, columns=code_index) for row, cat in zip(data.index, data.CATEGORY): codes = get_code(to_cat_list(cat)) dummy_frame.ix[row, codes] = 1 data = data.join(dummy_frame.add_prefix("category_")) # Write data to standard output data.to_csv(sys.stdout)