def test_ffill(self):
    result = merge_ordered(self.left, self.right, on='key',
                           fill_method='ffill')
    expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                          'lvalue': [1., 1, 2, 2, 3, 3.],
                          'rvalue': [nan, 1, 2, 3, 3, 4]})
    assert_frame_equal(result, expected)
def test_basic(self):
    result = merge_ordered(self.left, self.right, on='key')
    expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                          'lvalue': [1, nan, 2, nan, 3, nan],
                          'rvalue': [nan, 1, 2, 3, nan, 4]})
    assert_frame_equal(result, expected)
def test_multigroup(self):
    left = pd.concat([self.left, self.left], ignore_index=True)
    left['group'] = ['a'] * 3 + ['b'] * 3

    result = merge_ordered(left, self.right, on='key', left_by='group',
                           fill_method='ffill')
    expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
                          'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
                          'rvalue': [nan, 1, 2, 3, 3, 4] * 2})
    expected['group'] = ['a'] * 6 + ['b'] * 6
    assert_frame_equal(result, expected.loc[:, result.columns])

    result2 = merge_ordered(self.right, left, on='key', right_by='group',
                            fill_method='ffill')
    assert_frame_equal(result, result2.loc[:, result.columns])

    result = merge_ordered(left, self.right, on='key', left_by='group')
    assert result['group'].notna().all()
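These tests reference self.left and self.right without defining them. A minimal fixture consistent with the expected frames above (an assumption; the real test class may define these differently):

def setup_method(self):
    # Assumed fixture: left keys a/c/e, right keys b/c/d/f, which
    # reproduce the merged key column ['a'..'f'] expected above.
    self.left = DataFrame({'key': ['a', 'c', 'e'],
                           'lvalue': [1, 2, 3]})
    self.right = DataFrame({'key': ['b', 'c', 'd', 'f'],
                            'rvalue': [1, 2, 3, 4]})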
def test_basic(self): result = merge_ordered(self.left, self.right, on="key") expected = DataFrame( { "key": ["a", "b", "c", "d", "e", "f"], "lvalue": [1, nan, 2, nan, 3, nan], "rvalue": [nan, 1, 2, 3, nan, 4], } ) assert_frame_equal(result, expected)
def test_doc_example(self):
    left = DataFrame({'key': ['a', 'c', 'e', 'a', 'c', 'e'],
                      'lvalue': [1, 2, 3] * 2,
                      'group': list('aaabbb')})
    right = DataFrame({'key': ['b', 'c', 'd'],
                       'rvalue': [1, 2, 3]})
    result = merge_ordered(left, right, fill_method='ffill',
                           left_by='group')
    expected = DataFrame({'group': list('aaaaabbbbb'),
                          'key': ['a', 'b', 'c', 'd', 'e'] * 2,
                          'lvalue': [1, 1, 2, 2, 3] * 2,
                          'rvalue': [nan, 1, 2, 3, 3] * 2})
    assert_frame_equal(result, expected)
import pandas as pd  # needed for read_csv / merge_ordered below
import matplotlib.pyplot as plt
import datetime
import time
from scipy import signal
from collections import Counter

# Build the basic data frame
eurusd = pd.read_csv('eur_usd_hist.csv')
eurusd = eurusd.drop(eurusd.index[0:6])
eurusd.index = range(len(eurusd))
eurusd.columns = ['date', 'eurusdclose', 'high', 'low']

usdjpn = pd.read_csv('jpn_hist.csv')
usdjpn = usdjpn.drop(usdjpn.index[0:6])
usdjpn.index = range(len(usdjpn))
usdjpn.columns = ['date', 'usdjpyclose', 'usdcadclose']

df = pd.merge_ordered(eurusd, usdjpn, on='date')
df = df.drop(['high', 'low'], axis=1)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
# print(df.head())

# http://www.histdata.com/download-free-forex-historical-data/?/ascii/1-minute-bar-quotes/EURUSD
mindata = pd.read_csv('nov01_07_2011.csv', header=None)
mindata = mindata.drop(mindata.columns[[0, 1, 3, 4, 5]], axis=1)
mindata.columns = ['date', 'eurusdclose']
mindata['date'] = mindata.index
mindata['date'].iat[0]

# http://www.histdata.com/download-free-forex-historical-data/?/ascii/1-minute-bar-quotes/EURUSD
mindf_2017 = pd.read_csv('DAT_ASCII_EURUSD_M1_2017.csv', header=None, sep=';')
mindf_2017 = mindf_2017.drop(mindf_2017.columns[[0, 1, 3, 4, 5]], axis=1)
df = mindf_2017
def time_merge_ordered(self):
    merge_ordered(self.left, self.right, on='key', left_by='group')
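This ASV-style benchmark also needs a setup defining self.left and self.right. A minimal sketch, where the shapes and column sizes are assumptions rather than the actual benchmark fixture:

# assumes: import numpy as np; from pandas import DataFrame, merge_ordered
def setup(self):
    groups = np.tile(np.arange(10), 25)  # 250 group labels
    # 10,000-row left frame with group/key/lvalue columns (assumed sizes)
    self.left = DataFrame({'group': groups.repeat(40),
                           'key': np.tile(np.arange(40), 250),
                           'lvalue': np.random.randn(10000)})
    self.right = DataFrame({'key': np.arange(80),
                            'rvalue': np.random.randn(80)})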
def test_ffill(self): result = merge_ordered(self.left, self.right, on="key", fill_method="ffill") expected = DataFrame( {"key": ["a", "b", "c", "d", "e", "f"], "lvalue": [1.0, 1, 2, 2, 3, 3.0], "rvalue": [nan, 1, 2, 3, 3, 4]} ) assert_frame_equal(result, expected)
df_train = pd.read_csv("../inputs/train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("../inputs/test.csv", parse_dates=['timestamp'])
df_macro = pd.read_csv("../inputs/macro.csv", parse_dates=['timestamp'],
                       usecols=['timestamp'] + macro_cols)

# ylog will be log(1+y), as suggested by
# https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# Build df_all = (df_train rows + df_test rows) joined with df_macro
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
df_all = pd.merge_ordered(df_all, df_macro, on='timestamp', how='left')
print(df_all.shape)

# Add month-year count
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek
import pandas as pd

df1 = pd.DataFrame({
    "key": ["a", "c", "e", "a", "c", "e"],
    "lvalue": [1, 2, 3, 1, 2, 3],
    "group": ["a", "a", "a", "b", "b", "b"],
})
df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})

print(pd.merge_ordered(df1, df2, fill_method="ffill", left_by="group"))
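For reference, this prints the frame below (per the pandas merge_ordered docstring; exact column order can vary across pandas versions):

#   key  lvalue group  rvalue
# 0   a       1     a     NaN
# 1   b       1     a     1.0
# 2   c       2     a     2.0
# 3   d       2     a     3.0
# 4   e       3     a     3.0
# 5   a       1     b     NaN
# 6   b       1     b     1.0
# 7   c       2     b     2.0
# 8   d       2     b     3.0
# 9   e       3     b     3.0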
# Create a date column using the month and year columns of ur_tall
ur_tall['date'] = pd.to_datetime(ur_tall['year'] + '-' + ur_tall['month'])

# Sort ur_tall by date in ascending order
ur_sorted = ur_tall.sort_values(by='date')

# Plot the unempl_rate by date
ur_sorted.plot(y='unempl_rate', x='date')
plt.show()

# Check inverse correlation between Dow Jones and US treasury bond movement
# Use melt on ten_yr, unpivot everything besides the metric column
bond_perc = ten_yr.melt(id_vars='metric', var_name='date', value_name='close')

# Use query on bond_perc to select only the rows where metric == 'close'
# (note: drop(..., inplace=True) returns None, so it must not be chained)
bond_perc_close = bond_perc.query('metric == "close"').drop('metric', axis=1)

# Merge (ordered) dji and bond_perc_close on date with an inner join
dow_bond = pd.merge_ordered(dji, bond_perc_close, on='date', how='inner',
                            suffixes=('_dow', '_bond'))

# Plot only the close_dow and close_bond columns
dow_bond.plot(y=['close_dow', 'close_bond'], x='date', rot=90)
plt.show()
def RandomForest(df_x, df_y, winSize, winStep):
    '''
    -------- parameters ----------
    df_x: DataFrame {columns=["date", "x1", "x2", ..., "xn"]}
    df_y: DataFrame {columns=["date", "y"]}
    winSize: float
    winStep: float
    --------- return ----------
    DataFrame {columns=["date", "y"]}
    assumptions:
    1. 'xi' has been sorted by 'date'
    2. the 'y' column in 'X0' has been shifted
    '''
    if isinstance(df_x, gftIO.GftTable):
        df_x = df_x.asColumnTab()
    if isinstance(df_y, dict):
        df_y = df_y["y"]
    if isinstance(df_y, gftIO.GftTable):
        df_y = df_y.asColumnTab()

    # convert parameter types
    # NOTICE: integers are regarded as type O by GS, but the classifier needs int
    winSize = int(winSize)
    winStep = int(winStep)

    value_column = _findValueColumn(df_y.columns)
    # value_column: value
    # df_y.columns: Index(['date', 'value'], dtype='object')
    df_y.rename(columns={value_column: "y"}, inplace=True)
    df_y.y = pd.factorize(df_y.y)[0]

    # rename the timestamp column to "date"
    for col_name in df_y.columns:
        if isinstance(df_y.loc[0, col_name], pd.Timestamp):
            df_y.rename(columns={col_name: "date"}, inplace=True)
            break

    # remove meaningless columns
    df_y = df_y[["date", "y"]]

    # merge data
    df_x = df_x.sort_values("date", ascending=True)
    df_y = df_y.sort_values("date", ascending=True)
    df_y = df_y.set_index(np.arange(len(df_y)))  # reset index to start from 0

    # frequency error: if y_freq > x_freq, some y dates have no x data
    ls_missing_date = [d for d in list(df_y["date"]) if d not in list(df_x["date"])]
    if len(ls_missing_date) > 0:
        raise ValueError("y_freq > x_freq. Missing date in X:", ls_missing_date)

    # slice data: remove redundant x rows
    if len(df_x) != len(df_y):
        ls_slice_data = [d for d in list(df_x["date"]) if d not in list(df_y["date"])]
        df_tmp_x = df_x.set_index(["date"])
        df_tmp_x = df_tmp_x.drop(ls_slice_data)
        df_x = df_tmp_x.reset_index(drop=False)

    # reset indexes to start from 0
    df_x = df_x.set_index(np.arange(len(df_x)))
    df_y = df_y.set_index(np.arange(len(df_y)))

    # data to be trained
    df_data = pd.merge_ordered(df_x, df_y, on="date", how="outer")

    # value check
    if len(df_data.index) < winSize + 1:
        raise ValueError("the number of input data is not enough")

    # rolling window: fit on the previous winSize rows, predict row i
    ls_predicted = []
    for i in range(len(df_data.index)):
        if i < winSize:
            ls_predicted += [np.nan]
        else:
            start_index = i - winSize
            # fit
            n_x_train = df_data.iloc[start_index:i, 1:-1].values
            n_y_train = df_data.iloc[start_index:i, -1].values
            _CLASSIFIER.fit(n_x_train, n_y_train)
            # predict
            n_x_test = df_data.iloc[[i], 1:-1]
            y_test = _CLASSIFIER.predict(n_x_test)[0]
            ls_predicted += [y_test]
    df_data["predicted"] = ls_predicted

    # drop na
    df_data = df_data.dropna()

    # scores
    y_true = pd.factorize(df_data["y"])[0]
    y_pred = pd.factorize(df_data["predicted"])[0]
    num_accuracy_score = accuracy_score(y_true, y_pred)
    num_f1_score = f1_score(y_true, y_pred, average='macro')  # or: micro, weighted, None
    num_precision_score = precision_score(y_true, y_pred, average='macro')
    num_recall_score = recall_score(y_true, y_pred, average='macro')
    dict_score = {"accuracy_score": num_accuracy_score,
                  "f1_score": num_f1_score,
                  "precision_score": num_precision_score,
                  "recall_score": num_recall_score}

    # mean accuracy on the predicted labels
    y_test = df_data["predicted"].values
    X_test = df_data.iloc[:, 1:-2].values
    num_mean_accuracy = _CLASSIFIER.score(X_test, y_test)

    '''
    # feature_importances
    ls_fitness = list(zip(df_data.iloc[:, 1:-1], _CLASSIFIER.feature_importances_))
    n_fitness = np.array(list(map(list, ls_fitness)))
    df_fitness = pd.DataFrame({"feature": n_fitness[:, 0],
                               "importance": n_fitness[:, 1]})
    # print(df_fitness)
    '''

    # result
    df_data = df_data[["date", "predicted"]]
    dict_result = {"result": df_data,
                   "mean_accuracy": num_mean_accuracy,
                   "scores": dict_score}  # , "fitness": df_fitness}
    return dict_result
# Concatenate medals: medals
medals = pd.concat(medals, keys=['bronze', 'silver', 'gold'])

# Then slice the index
idx = pd.IndexSlice
print(sales.loc[idx[:, 'Mediacore'], :])

# Inner join via concat
medal_list = [bronze, silver, gold]
medals = pd.concat(medal_list, keys=['bronze', 'silver', 'gold'],
                   axis=1, join='inner')

# Merging
merge_by_id = pd.merge(revenue, managers, on='branch_id')  # inner join
combined = pd.merge(revenue, managers, left_on='city', right_on='branch')
# left join on multiple key columns
pd.merge(sales, managers, left_on=['city', 'state'],
         right_on=['branch', 'state'], how='left')

# merge_ordered defaults to an outer join
tx_weather_ffill = pd.merge_ordered(austin, houston, on='date',
                                    suffixes=['_aus', '_hus'],
                                    fill_method='ffill')

# Case study
# Import pandas
import pandas as pd

# Create empty dictionary: medals_dict
medals_dict = {}
for year in editions['Edition']:
    file_path = 'summer_{:d}.csv'.format(year)
    medals_dict[year] = pd.read_csv(file_path)
    medals_dict[year] = medals_dict[year][['Athlete', 'NOC', 'Medal']]
    medals_dict[year]['Edition'] = year

# Concatenate medals_dict: medals
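The notes break off at that last instruction; a plausible completion, assumed from the instruction's wording rather than taken from the original notes:

# Assumed completion of the step above
medals = pd.concat(medals_dict, ignore_index=True)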
def time_merge_ordered(self): merge_ordered(self.left, self.right, on="key", left_by="group")
def add_dilation_to_fxd(GZD, FXD):
    GZD['dilation'] = l_r_dilation(GZD)
    # Merge the dilation signal onto the fixation data by timestamp,
    # forward-filling gaps. The original passed left_by='timestamp',
    # which groups rather than joins; on='timestamp' is the likely intent.
    FXD = pd.merge_ordered(FXD, GZD[['timestamp', 'dilation']],
                           on='timestamp', fill_method='ffill')
    avg = FXD['dilation'].mean()
    FXD['dilation'] = FXD['dilation'].fillna(avg)
    return FXD
import numpy as np
import pandas as pd
from keras.layers import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
import lstm, time
from sklearn import model_selection, preprocessing

"""Experimental. Doesn't work."""

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
macro = pd.read_csv("macro.csv", usecols=macro_features)

macro_train = pd.merge_ordered(train, macro, on='timestamp', how='left')
macro_test = pd.merge_ordered(test, macro, on='timestamp', how='left')

id_test = macro_test.id
y_train = macro_train["price_doc"]
x_train = macro_train.drop(["id", "timestamp", "price_doc"], axis=1)
x_test = macro_test.drop(["id", "timestamp"], axis=1)
print(x_train.shape)

# Step 1 - Perform preprocessing
for c in x_train.columns:
    if x_train[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values))
        x_train[c] = lbl.transform(list(x_train[c].values))
        total_distance = total_distance + \
            np.sqrt(pow((dfcity.X[city_num] - dfcity.X[prev_city]), 2) +
                    pow((dfcity.Y[city_num] - dfcity.Y[prev_city]), 2)) * \
            (1 + 0.1 * ((step_num % 10 == 0) * int(not(prime_cities[prev_city]))))
        prev_city = next_city
        step_num = step_num + 1
    return total_distance

dumbest_path = list(df_cities.CityId[:].append(pd.Series([0])))
print('Total distance with the dumbest path is ' +
      "{:,}".format(total_distance(df_cities, dumbest_path)))

# ### Let us take a look at the first 100 steps of the dumbest path

# In[ ]:

df_path = pd.merge_ordered(pd.DataFrame({'CityId': dumbest_path}),
                           df_cities, on=['CityId'])
fig, ax = plt.subplots(figsize=(20, 20))
ax.plot(df_path.iloc[0:100, ]['X'], df_path.iloc[0:100, ]['Y'], marker='o')
for i, txt in enumerate(df_path.iloc[0:100, ]['CityId']):
    ax.annotate(txt, (df_path.iloc[0:100, ]['X'][i],
                      df_path.iloc[0:100, ]['Y'][i]), size=15)

# ### As we can see, the dumbest path seems pretty bad. We are sending Santa
# all over the map, without any consideration for him whatsoever :)

# ## Slightly better path: sort the cities in X,Y coordinate order

# In[ ]:

sorted_cities = list(df_cities.iloc[1:, ].sort_values(['X', 'Y'])['CityId'])
sorted_cities = [0] + sorted_cities + [0]
print('Total distance with the sorted city path is ' +
      "{:,}".format(total_distance(df_cities, sorted_cities)))