def adjust_hyper_param(df_X, df_Y, model_flag):
    """Grid-search gradient-boosting hyper-parameters and dump the CV table.

    Only ``model_flag == 3`` triggers any work; every other value returns
    ``None`` without touching the data.

    Args:
        df_X: Feature matrix handed to ``GridSearchCV.fit``.
        df_Y: Target values handed to ``GridSearchCV.fit``.
        model_flag: Selector for the model to tune.

    Side effects:
        Prints the best parameters, best score and best estimator, then
        passes the full ``cv_results_`` table to ``output.write_csv``.
    """
    if model_flag != 3:
        return

    search_space = [{
        "learning_rate": [0.05, 0.07, 0.1],
        "n_estimators": [60, 70, 80, 90, 100],
        "max_depth": [3, 4, 5],
        "subsample": [0.6, 0.7, 0.8],
    }]
    regressor = gradient_boosting.GradientBoostingRegressor(
        loss="ls",
        criterion="friedman_mse",
        warm_start=True,
    )
    searcher = GridSearchCV(
        regressor,
        param_grid=search_space,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
        cv=5,
        return_train_score=True,
    )
    searcher.fit(df_X, df_Y)

    # Report the winning configuration before persisting every candidate.
    print(searcher.best_params_)
    print(searcher.best_score_)
    print(searcher.best_estimator_)

    cv_table = pd.DataFrame(searcher.cv_results_)
    output.write_csv(cv_table, 2207, "2207_GDBT_grid_search")
def get_stock_info(options, stock_code):
    """Scrape the price-history table for ``stock_code`` and save it as CSV.

    Opens the cnyes history page in Chrome, sets the query range to
    2009/01/01 - 2019/12/31, submits the form, reads the result table and
    passes the parsed rows to ``output.write_csv``.

    Args:
        options: Chrome options forwarded to ``webdriver.Chrome``.
        stock_code: Stock identifier; used to build the page URL and
            forwarded to ``output.write_csv``.

    Errors raised while driving the page propagate to the caller. Errors
    raised while parsing or writing the table are printed, not raised.
    The browser session is shut down in every case.
    """
    base_url = "https://www.cnyes.com/twstock/ps_historyprice/"
    url = base_url + str(stock_code) + ".htm"
    table_xpath = "//*[@id='main3']/div[5]/div[3]/table/tbody"
    column_count = 10

    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.get(url)
        driver.maximize_window()
        # Set the start date to 2009/01/01.
        start_date = driver.find_element_by_xpath(
            "//*[@id='ctl00_ContentPlaceHolder1_startText']")
        driver.execute_script("arguments[0].value = '2009/01/01';", start_date)
        # Set the end date to 2019/12/31.
        end_date = driver.find_element_by_xpath(
            "//*[@id='ctl00_ContentPlaceHolder1_endText']")
        driver.execute_script("arguments[0].value = '2019/12/31';", end_date)
        search_btn = driver.find_element_by_xpath(
            "//*[@id='ctl00_ContentPlaceHolder1_submitBut']")
        search_btn.click()
        driver.minimize_window()

        col_name = [
            driver.find_element_by_xpath(
                table_xpath + "/tr[1]/th[" + str(i) + "]").text
            for i in range(1, column_count + 1)
        ]
        table_element = driver.find_element_by_xpath(table_xpath)

        try:
            # The table text is consumed as a flat stream of space-separated
            # cells and cut into rows of ``column_count`` cells each.
            rows = []
            current_row = []
            for line in table_element.text.split("\n"):
                for cell in line.split(" "):
                    if len(current_row) == column_count:
                        rows.append(current_row)
                        current_row = []
                    current_row.append(cell)
            # Flush the final complete row; previously it stayed in the
            # buffer and the last record of the table was lost.
            if len(current_row) == column_count:
                rows.append(current_row)

            # rows[0] is the header text of the table itself, so skip it.
            df = pd.DataFrame(rows[1:], columns=col_name)
            output.write_csv(df, stock_code, "2207_info")
        except Exception as e:
            print(e)
            print("Fail to get stock info,stock_code : {}".format(stock_code))
    finally:
        # Always release the browser, including when scraping fails.
        driver.quit()
def KD_value(df):
    """Compute 9-period stochastic K and D values and write them to CSV.

    Columns are read by position: column 2 is used as the high, column 3
    as the low and column 4 as the close. Each window spans rows
    ``i`` to ``i + 8``, and the loop starts at the last full window and
    walks back to row 0.
    NOTE(review): this implies the frame is ordered newest-first - confirm
    against the caller.

    Both series are seeded with 50. Each day is smoothed against the
    previous day's value: ``K = 0.67 * K_prev + 0.33 * RSV`` and
    ``D = 0.67 * D_prev + 0.33 * K``. RSV is expressed on the same 0-100
    scale as the seed.

    Args:
        df: Price table; it is also printed to stdout.

    Side effects:
        Passes a frame with ``K_value`` and ``D_value`` columns (same row
        order as ``df``, seed entry last) to ``output.write_csv``.
    """
    print(df)
    window = 9
    k_values = [50]
    d_values = [50]
    for row in reversed(range(df.shape[0] - (window - 1))):
        lowest = min(df.iloc[row:row + window, 3])
        highest = max(df.iloc[row:row + window, 2])
        spread = highest - lowest
        if spread == 0:
            # Flat window: RSV is undefined, so use the neutral midpoint
            # rather than letting NaN/inf spread through the recurrence.
            rsv = 50.0
        else:
            rsv = (df.iloc[row, 4] - lowest) / spread * 100
        # Smooth against the most recent value. The earlier index
        # arithmetic always resolved to the first computed entry.
        today_k = k_values[-1] * 0.67 + rsv * 0.33
        k_values.append(today_k)
        today_d = d_values[-1] * 0.67 + today_k * 0.33
        d_values.append(today_d)

    KD_df = pd.DataFrame({
        "K_value": list(reversed(k_values)),
        "D_value": list(reversed(d_values)),
    })
    output.write_csv(KD_df, 2207, "2207_KD_value")
def _rsi_window_means(values, window):
    """Return the mean of every run of ``window`` consecutive ``values``."""
    return [
        sum(values[start:start + window]) / window
        for start in range(len(values) - (window - 1))
    ]


def _rsi_ratio(mean_gain, mean_loss):
    """Return ``100 * gain / (gain + loss)``, or 50.0 when both are zero."""
    total = mean_gain + mean_loss
    if total == 0:
        # No price movement in the window: the ratio is undefined, so
        # report the neutral midpoint instead of dividing by zero.
        return 50.0
    return 100 * mean_gain / total


def _rsi_ordered_prefix(readings, short_above_long):
    """Count how many leading neighbour pairs are strictly ordered."""
    count = 0
    for shorter, longer in zip(readings, readings[1:]):
        if short_above_long:
            ordered = shorter > longer
        else:
            ordered = longer > shorter
        if not ordered:
            break
        count += 1
    return count


def RSI_value(df):
    """Compute 3/5/7/10-day RSI series plus trend counters and write a CSV.

    The daily change is read from the ``漲跌`` column. For each window
    size the mean gain and mean loss over every run of consecutive rows
    are combined into ``100 * gain / (gain + loss)``. All series are then
    cut to the length of the shortest one so that row ``j`` of every
    column starts at the same input row.

    ``RSI_up_thrend`` counts how many of the comparisons
    3-day > 5-day > 7-day > 10-day hold in sequence, starting from the
    shortest window. ``RSI_down_thrend`` does the same with the
    comparisons reversed.
    NOTE(review): the source formatting was lost, so it is assumed that
    these comparisons were nested (each one checked only if the previous
    one held) - confirm.

    Args:
        df: Frame containing a numeric ``漲跌`` column.

    Side effects:
        Passes the resulting frame to ``output.write_csv``.
    """
    catch_range_lst = [3, 5, 7, 10]
    fluctuation_lst = df["漲跌"].to_list()
    gains = [change if change > 0 else 0 for change in fluctuation_lst]
    losses = [abs(change) if change < 0 else 0 for change in fluctuation_lst]

    RSI_lst = []
    for window in catch_range_lst:
        # With a window of n rows, the last n - 1 start positions have no
        # complete window and are skipped.
        mean_gains = _rsi_window_means(gains, window)
        mean_losses = _rsi_window_means(losses, window)
        RSI_lst.append([
            _rsi_ratio(gain, loss)
            for gain, loss in zip(mean_gains, mean_losses)
        ])

    # Align every window size to the shortest series.
    shared_length = min(len(series) for series in RSI_lst)
    RSI_lst = [series[:shared_length] for series in RSI_lst]

    RSI_df = pd.DataFrame(
        dict(zip(["3日RSI", "5日RSI", "7日RSI", "10日RSI"], RSI_lst)))

    RSI_up_thrend = []
    RSI_down_thrend = []
    for readings in zip(*RSI_lst):
        RSI_up_thrend.append(_rsi_ordered_prefix(readings, True))
        RSI_down_thrend.append(_rsi_ordered_prefix(readings, False))

    RSI_df["RSI_up_thrend"] = RSI_up_thrend
    RSI_df["RSI_down_thrend"] = RSI_down_thrend
    output.write_csv(RSI_df, 2207, "2207_RSI_value")
def _macd_seeded_ema(values, period):
    """Return an EMA of ``values`` seeded with the mean of the first ``period``.

    Entry 0 is the plain average of ``values[:period]``. Each later entry
    is ``prev * (period - 1) / (period + 1) + value * 2 / (period + 1)``.
    Returns an empty list when fewer than ``period`` values are given.
    """
    if len(values) < period:
        return []
    smoothed = [sum(values[:period]) / period]
    keep = period - 1
    span = period + 1
    for value in values[period:]:
        smoothed.append(smoothed[-1] * keep / span + value * 2 / span)
    return smoothed


def MACD_value(df):
    """Compute DIF, MACD and crossover flags and write them to CSV.

    Columns are read by position (2, 3 and 4) to build the demand index
    ``(col2 + col3 + 2 * col4) / 4``. The input is walked from its last
    row to its first, so every intermediate list runs from the oldest day
    to the newest. The output frame is reversed back to the input order.

    Steps:
        1. 12- and 26-period EMAs of the demand index; the seed day of
           each is discarded and the two are aligned at their newest end.
        2. ``DIF = EMA12 - EMA26``.
        3. MACD is seeded with the mean of the first nine DIF values and
           paired with ``DIF[9]``. Each later value is
           ``MACD_prev * 4 / 5 + DIF_today / 5``.
        4. ``MACD_buy`` / ``MACD_sell`` flag rows where the DIF/MACD
           ordering differs from the row above.

    NOTE(review): row ``i - 1`` is the later day in this ordering, so
    ``MACD_buy`` fires when DIF is above MACD on the earlier day and below
    it on the later day - confirm that this direction is intended.

    Args:
        df: Price table. Raises ``IndexError`` when it has too few rows
            to seed the EMAs.

    Side effects:
        Passes the resulting frame to ``output.write_csv``.
    """
    # Every list built here runs chronologically (2009 towards 2019).
    DI_lst = [
        (df.iloc[i, 2] + df.iloc[i, 3] + 2 * df.iloc[i, 4]) / 4
        for i in reversed(range(df.shape[0]))
    ]
    EMA12_lst = _macd_seeded_ema(DI_lst, 12)
    EMA26_lst = _macd_seeded_ema(DI_lst, 26)

    # Discard the seed day of each EMA.
    EMA12_lst.pop(0)
    EMA26_lst.pop(0)
    EMA12_lst = EMA12_lst[len(EMA12_lst) - len(EMA26_lst):]

    DIF_lst = [fast - slow for fast, slow in zip(EMA12_lst, EMA26_lst)]

    MACD_lst = [sum(DIF_lst[:9]) / 9]
    for dif in DIF_lst[10:]:
        # Smooth against the previous MACD and the same day's DIF. The old
        # indexing lagged both terms.
        MACD_lst.append(MACD_lst[-1] * 4 / 5 + dif / 5)

    # Drop the first nine days so DIF has the same length as MACD.
    DIF_lst = DIF_lst[9:]

    macd_df = pd.DataFrame({
        "DIF": list(reversed(DIF_lst)),
        "MACD": list(reversed(MACD_lst)),
    })

    MACD_buy = []
    MACD_sell = []
    for i in range(1, macd_df.shape[0]):
        above_dif, above_macd = macd_df.iloc[i - 1, 0], macd_df.iloc[i - 1, 1]
        here_dif, here_macd = macd_df.iloc[i, 0], macd_df.iloc[i, 1]
        MACD_buy.append(
            1 if above_dif < above_macd and here_dif > here_macd else 0)
        MACD_sell.append(
            1 if above_dif > above_macd and here_dif < here_macd else 0)

    # Row 0 has no row above it to compare with, so it carries no flag.
    macd_df.drop(0, axis=0, inplace=True)
    macd_df["MACD_buy"] = MACD_buy
    macd_df["MACD_sell"] = MACD_sell
    output.write_csv(macd_df, 2207, "2207_MACD_value")
def hard_insert():
    """Merge courses offered by supporting departments into each course list.

    Reads the four department frames from ``read_file`` and the saved
    course pages of the math (2104), business administration (5204) and
    economics (5104) departments for directories ``1081`` and ``1082``
    under the current working directory. Rows whose title and remarks
    match a fixed set of keyword pairs are appended to the finance, MIS,
    IE and accounting frames. Each frame is then sorted by year standing
    and semester, and the list ``[ie, financial, accounting, mis]`` is
    passed to ``output.write_csv``.

    The frames returned by ``read_file`` lose their ``Remarks`` column in
    place.
    """
    base_dir = os.getcwd()
    financial_df, mis_df, ie_df, accounting_df = read_file()

    # Raw HTML per semester, in department order: math, business
    # administration, economics.
    first_semester_pages = []
    second_semester_pages = []
    for dept_code in ("2104", "5204", "5104"):
        for folder, bucket in (("1081", first_semester_pages),
                               ("1082", second_semester_pages)):
            page_path = base_dir + "/" + folder + "/" + dept_code + "_e.html"
            with open(page_path, 'r', encoding='utf-8') as file:
                bucket.append(file.read())

    remine_cols = [
        "Year Standing", "Course ID", "Course Title", "Credit",
        "Credit type", "Day/Period",
        "Remarks(Might contain Chinese due to course remarks which cannot be translated afterwards)"
    ]
    short_cols = [
        "Year Standing", "Course ID", "Course Title", "Credit",
        "Credit type", "Day/Period", "Remarks"
    ]

    def parse_pages(pages, semester):
        # First table of each page, trimmed to the kept columns and tagged
        # with its semester number.
        frames = []
        for page in pages:
            frame = pd.read_html(page)[0]
            frame = frame[remine_cols]
            frame.columns = short_cols
            frame['semester'] = semester
            frames.append(frame)
        return frames

    df_lst = concat_df_lst(parse_pages(first_semester_pages, 1),
                           parse_pages(second_semester_pages, 2))
    math_df = df_lst[0]
    manage_df = df_lst[1]
    eco_df = df_lst[2]
    for frame in (math_df, manage_df, eco_df):
        frame.reset_index(inplace=True)

    def pick(frame, title, department):
        # Rows whose title contains ``title`` and whose remarks mention
        # ``department``.
        return frame.loc[frame["Course Title"].str.contains(title)
                         & frame["Remarks"].str.contains(department)]

    def stack(pieces):
        # Append the pieces one at a time, renumbering rows at each step.
        combined = pieces[0]
        for piece in pieces[1:]:
            combined = pd.concat([combined, piece], axis=0,
                                 ignore_index=True)
        return combined

    finance = "Finance and Banking"
    info_mgmt = "Information Management"
    accounting = "Accounting and Information Technology"

    # Per receiving department: courses supplied by math, business
    # administration, economics, MIS, finance and accounting, in that order.
    mis_insert_df = stack([
        pick(math_df, "Calculus", info_mgmt),
        pick(manage_df, "Introduction to Business", info_mgmt),
        pick(manage_df, "Business Ethics", info_mgmt),
        pick(eco_df, "Principle of Economics", info_mgmt),
        pick(financial_df, "Statistics", info_mgmt),
        pick(accounting_df, "Accounting", info_mgmt),
    ])
    ie_insert_df = pick(math_df, "Calculus", "Computer Science")
    financial_insert_df = stack([
        pick(math_df, "Calculus", finance),
        pick(manage_df, "Introduction to Business", finance),
        pick(eco_df, "Principle of Economics", finance),
        pick(eco_df, "Microeconomics", finance),
    ])
    accounting_insert_df = stack([
        pick(math_df, "Calculus", accounting),
        pick(manage_df, "Seminar on Humanistic and Business Ethics",
             accounting),
        pick(eco_df, "Principle of Economics", accounting),
        pick(mis_df, "Introduction to Computer", accounting),
        pick(financial_df, "Statistics", accounting),
    ])

    def absorb(base, extra, **concat_options):
        # Strip the remarks from the base frame in place, then append the
        # prepared extra rows.
        base.drop("Remarks", axis=1, inplace=True)
        return pd.concat([base, extra], axis=0, ignore_index=True,
                         **concat_options)

    financial_insert_df.drop(['Remarks', "index"], axis=1, inplace=True)
    financial_df = absorb(financial_df, financial_insert_df)

    mis_insert_df.drop(['Remarks', 'index'], axis=1, inplace=True)
    mis_df = absorb(mis_df, mis_insert_df)

    ie_insert_df.drop(['Remarks', 'index'], axis=1, inplace=True)
    ie_df = absorb(ie_df, ie_insert_df)

    accounting_insert_df.drop(['Remarks', 'index'], axis=1, inplace=True)
    # Manually remove Principle of Economics (II) for the accounting
    # department.
    unwanted_rows = accounting_insert_df[
        accounting_insert_df["Course ID"] == 5101002].index
    accounting_insert_df.drop(unwanted_rows, axis=0, inplace=True)
    accounting_df = absorb(accounting_df, accounting_insert_df, sort=True)

    # Order every list by year standing, then semester.
    for frame in (financial_df, mis_df, ie_df, accounting_df):
        frame.sort_values(by=["Year Standing", "semester"], ascending=True,
                          inplace=True)

    # Special handling for individually transferable courses: the
    # accounting department's "Statistics" is renamed to "Statistics (I)"
    # to make comparison easier.
    is_plain_statistics = accounting_df["Course Title"] == "Statistics"
    accounting_df.loc[is_plain_statistics, "Course Title"] = "Statistics (I)"

    new_df_lst = [ie_df, financial_df, accounting_df, mis_df]
    output.write_csv(new_df_lst)