def makeImportVocab(cls, basic_path=None, keyword_csv_file=None, important_vocab_csv_file=None):
    # Initialize the source file path and the output file path
    if basic_path is None:
        basic_path = os.path.dirname(os.path.abspath(__file__))
    if keyword_csv_file is None:
        return None
    if important_vocab_csv_file is None:
        return None
    input_data_path = os.path.join(basic_path, keyword_csv_file)
    output_data_path = os.path.join(basic_path, important_vocab_csv_file)
    VTool.makeDirs(files=[output_data_path])

    # Truncate the output file and initialize the column names
    pd.DataFrame({"0": [], "1": []}).to_csv(output_data_path, index=False, encoding="utf-8")

    i = 0
    vocab = {}
    reader = pd.read_csv(input_data_path, chunksize=5000)
    for sentences in reader:
        for sentence in sentences['1']:
            i += 1
            print(i)
            if str(sentence) == 'nan':
                continue
            words = sentence.split(" ")
            for word in words:
                if word not in vocab:
                    vocab[word] = 1
                else:
                    vocab[word] += 1

    # Sort by descending frequency and append the result to the output CSV
    sorted_vocab = sorted(vocab.items(), key=lambda v: v[1], reverse=True)
    data = []
    for word, num in sorted_vocab:
        data.append([word, num])
    if len(data) != 0:
        pd.DataFrame(data).to_csv(output_data_path, index=False, header=False, mode="a", encoding="utf-8")
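# A minimal standalone sketch of the same frequency count using collections.Counter,
# assuming the keyword cache CSV has columns "0" (news id) and "1" (space-separated
# keywords); the file name in the usage comment is only an illustration.
import collections

import pandas as pd


def count_vocab(csv_path, chunksize=5000):
    counter = collections.Counter()
    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        for sentence in chunk["1"].dropna():
            counter.update(str(sentence).split(" "))
    return counter.most_common()  # [(word, count), ...], sorted by descending frequency

# usage (illustrative): count_vocab("tfidf_keyword_cache.csv")[:10]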
def train(self, basic_path=None, input_file=None, output_folder=None, embedding_dim=0,
          folder_extra='', filter_sizes=None, reduce_num=0, test_part_start=0.9,
          test_part_end=1, data_stand=False, times=10):
    if basic_path is None:
        basic_path = os.path.dirname(os.path.abspath(__file__))
    if input_file is None or output_folder is None:
        return None
    if filter_sizes is not None:
        self.filter_sizes = filter_sizes
    input_path = os.path.join(basic_path, input_file)
    output_path = os.path.join(basic_path, output_folder)
    VTool.makeDirs(folders=[output_path])
    self.embedding_dim = embedding_dim
    print("Writing to {}\n".format(output_path))

    tf.reset_default_graph()
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=self.allow_soft_placement,
                                      log_device_placement=self.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            x_train, y_train, batch_index, _, _, _ = DataHelper.get_number_data(
                file=input_path, batch_size=self.batch_size, reduce_num=reduce_num,
                test_part_start=test_part_start, test_part_end=test_part_end, stand=data_stand)
            if len(x_train) <= 0:
                print("CSV No Data!!!")
                exit()
            print("x.shape = {}".format(x_train.shape))
            print("y.shape = {}".format(y_train.shape))

            cnn = CnnImage(sequence_length=x_train.shape[1],
                           num_classes=y_train.shape[1],
                           embedding_size=self.embedding_dim,
                           filter_sizes=self.filter_sizes,
                           num_filters=self.num_filters,
                           full_layer_filters=self.full_layer_filters,
                           l2_reg_lambda=self.l2_reg_lambda)

            # Define the training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(self.learn_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            '''
            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            print("Writing to {}\n".format(output_path))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train summaries
            summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            summary_dir = os.path.join(output_path, "train_summaries")
            summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)
            VTool.makeDirs(folders=[summary_dir])
            '''

            checkpoint_dir = os.path.abspath(os.path.join(output_path, "checkpoints" + folder_extra))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            VTool.makeDirs(folders=[checkpoint_dir])
            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            for i in range(times):
                for step in range(len(batch_index) - 1):
                    feed_dict = {
                        cnn.input_x: x_train[batch_index[step]:batch_index[step + 1]],
                        cnn.input_y: y_train[batch_index[step]:batch_index[step + 1]],
                        cnn.dropout_keep_prob: self.dropout_keep_prob
                    }
                    _, loss, accuracy, predictions, input_y_index = sess.run(
                        [train_op, cnn.loss, cnn.accuracy, cnn.predictions, cnn.input_y_index],
                        feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                    all_accuracy = cnn.various_accuracy(self.num_labels,
                                                        input_y_index.tolist(),
                                                        predictions.tolist())
                    for a in all_accuracy:
                        print("input_nums: {:g}, pre_nums: {:g}, right_nums: {:g}, accuracy: {:g}"
                              .format(a[0], a[1], a[2], a[3]))
                    # summary_writer.add_summary(summaries, step)
                if i % 5 == 0:
                    print("Model saved:", saver.save(sess, checkpoint_prefix))

            print("Model saved:", saver.save(sess, checkpoint_prefix))
            print("The train has finished")
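# Hedged sketch of reloading the checkpoint written by train() for later evaluation.
# checkpoint_dir must match the "checkpoints" + folder_extra directory used above; the
# import_meta_graph call assumes the default .meta file that tf.train.Saver writes.
import tensorflow as tf


def load_latest_checkpoint(checkpoint_dir):
    ckpt = tf.train.latest_checkpoint(checkpoint_dir)  # path of the newest "model" checkpoint
    sess = tf.Session()
    saver = tf.train.import_meta_graph(ckpt + ".meta")
    saver.restore(sess, ckpt)
    return sess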
def calcuWordTrend(self, cur=None, choose_dates=None, basic_path=None, word_cache_file=None, output_file=None): if cur == None or choose_dates == None or output_file == None or word_cache_file == None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) word_cache_path = os.path.join(basic_path, word_cache_file) index = 0 is_reload = False for choose_date in choose_dates: output_path = os.path.join(basic_path, output_file + '_%s.csv' % index) index += 1 if os.path.exists(output_path) and not is_reload: continue VTool.makeDirs(files=[output_path]) date_str_arr = [] date_rate = {} for k in choose_date: for d in choose_date[k]: date = d[1] date_str_arr.append('"' + date + '"') date_rate[date] = d[2] date_str = ",".join(date_str_arr) news = [] if len(date_str_arr) > 0: cur.execute( "SELECT id, time FROM news WHERE time in (%s) order by time, content" % (date_str)) news_temp = cur.fetchall() news_by_id = {} for n in news_temp: news_by_id[n[0]] = {} news_by_id[n[0]]['date'] = str(n[1]) news_by_id[n[0]]['words'] = '' del news_temp nid_len = len(news_by_id) reader = pd.read_csv(word_cache_path, chunksize=1000) for sentences in reader: for k in sentences['1'].keys(): nid = sentences['0'][k] if nid in news_by_id and news_by_id[nid]['words'] == '': news_by_id[nid]['words'] = str( sentences['1'][k]).split(" ") nid_len -= 1 if nid_len <= 0: break reader.close() del reader, sentences print(len(news_by_id)) word_dict = { "words": {}, "up_total_words": 0, "down_total_words": 0 } i = 0 for k in news_by_id: date = news_by_id[k]['date'] if date not in date_rate: continue if date_rate[date] >= 0: ckey = "up" else: ckey = "down" words = news_by_id[k]['words'] for w in words: if w not in word_dict["words"]: word_dict["words"][w] = {"up": 0, "down": 0} word_dict["words"][w][ckey] += 1 word_dict["%s_total_words" % ckey] += 1 i += 1 print(i) if word_dict["up_total_words"] != 0: for w in word_dict["words"]: word_dict["words"][w]["up"] = word_dict["words"][w][ "up"] / word_dict["up_total_words"] if word_dict["down_total_words"] != 0: for w in word_dict["words"]: word_dict["words"][w]["down"] = word_dict["words"][w][ "down"] / word_dict["down_total_words"] csv_data = [] for w in word_dict["words"]: csv_data.append([ w, word_dict["words"][w]["up"], word_dict["words"][w]["down"] ]) csv_data.append([ 'total_words', word_dict["up_total_words"], word_dict["down_total_words"] ]) pd.DataFrame({ "0": [], "1": [], "2": [] }).to_csv(output_path, index=False, encoding="utf-8") pd.DataFrame(csv_data).to_csv(output_path, index=False, header=False, mode="a", encoding="utf-8")
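# Toy illustration of the statistic calcuWordTrend writes out: every word count is divided
# by the total word count of its class, so the two columns can later act as P(word | up)
# and P(word | down) in a naive-Bayes style score. The input format here is made up.
def word_trend(docs):
    # docs: [(rate, ["word", ...]), ...]
    counts, totals = {}, {"up": 0, "down": 0}
    for rate, words in docs:
        key = "up" if rate >= 0 else "down"
        for w in words:
            counts.setdefault(w, {"up": 0, "down": 0})[key] += 1
            totals[key] += 1
    return {w: {k: (c[k] / totals[k] if totals[k] else 0) for k in c}
            for w, c in counts.items()}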
def makeTrendStockOriginCsv(cls, cur=None, start_date=None, end_date=None, day_num=3, basic_path=None, stock_id=None, word_trend_file=None, news_file=None, output_file=None): #初始化源文件路径和存储文件路径 if cur is None or start_date is None or end_date is None or stock_id is None or output_file is None or word_trend_file is None or news_file is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) word_trend_path = os.path.join(basic_path, word_trend_file) news_path = os.path.join(basic_path, news_file) output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) #清空接收路径下的文件,初始化列名 pd.DataFrame({ "0": [], "1": [] }).to_csv(output_path, index=False, encoding="utf-8") word_trend = {} word_trend_temp = pd.read_csv(word_trend_path) for k in word_trend_temp["0"].keys(): word_trend[word_trend_temp["0"][k]] = [ word_trend_temp["1"][k], word_trend_temp["2"][k] ] p_up = word_trend['total_words'][0] / (word_trend['total_words'][0] + word_trend['total_words'][1]) p_down = word_trend['total_words'][1] / (word_trend['total_words'][0] + word_trend['total_words'][1]) cur.execute( "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) count = cur.fetchall() count = count[0][0] deviation = 2 skip = 100 slimit = 0 while slimit < count: cur.execute( "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d " % (stock_id, start_date, end_date, 0 if slimit - day_num - deviation < 0 else slimit - day_num - deviation, skip if slimit - day_num - deviation < 0 else skip + day_num + deviation)) history_tt = cur.fetchall() history_t = [] for h in history_tt: history_t.append([ int(h[0]), float(h[1]), float(h[2]), float(h[3]), float(h[4]), float(h[5]), float(h[6]), float(h[7]), float(h[8]), str(h[9]) ]) del history_tt history_temp = [] for h in zip(*history_t): history_temp.append(h) history = { 'stock_id': history_temp[0], 'opening': history_temp[1], 'closing': history_temp[2], 'difference': history_temp[3], 'percentage_difference': history_temp[4], 'lowest': history_temp[5], 'highest': history_temp[6], 'volume': history_temp[7], 'amount': history_temp[8], 'date': history_temp[9] } del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by=['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) ''' ''' sdate = str(history['date'][history['date'].keys()[0]]) edate = str(history['date'][history['date'].keys()[-1]]) cur.execute( "SELECT GROUP_CONCAT(id SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' group by time" % (sdate, edate)) news_temp = cur.fetchall() news_by_date = {} news_by_id = {} for n in news_temp: news_by_date[str(n[1])] = n[0].split(",") for nid in news_by_date[str(n[1])]: news_by_id[nid] = None del news_temp nid_len = len(news_by_id) reader = pd.read_csv(news_path, chunksize=1000) for sentences in reader: if nid_len > 0: for k in sentences['1'].keys(): nid = str(sentences['0'][k]) if nid in news_by_id and news_by_id[nid] == None: news_by_id[nid] = str(sentences['1'][k]).split(" ") wp_up = p_up wp_down = p_down for w in news_by_id[nid]: if w not in word_trend: wp_up *= (1 / word_trend['total_words'][0]) wp_down *= (1 / 
word_trend['total_words'][1]) else: if word_trend[w][0] > 0: wp_up *= word_trend[w][0] else: wp_up *= (1 / word_trend['total_words'][0]) if word_trend[w][1] > 0: wp_down *= word_trend[w][1] else: wp_down *= ( 1 / word_trend['total_words'][1]) while True: if wp_up < 1 and wp_down < 1: wp_up *= 10 wp_down *= 10 else: break news_by_id[nid] = [ wp_up / (wp_up + wp_down), -1 * wp_down / (wp_up + wp_down) ] nid_len -= 1 if nid_len <= 0: break else: break reader.close() del reader, sentences for d in news_by_date: sumn = [0, 0] for nid in news_by_date[d]: sumn[0] += news_by_id[nid][0] sumn[1] += news_by_id[nid][1] le = len(news_by_date[d]) if le > 0: sumn[0] /= le sumn[1] /= le news_by_date[d] = sumn print(d) history['news_pos_num'] = 0 history['news_neg_num'] = 0 for i in history.index: history.loc[i, 'rate'] = str( np.round(float(history['rate'][i]), 2)) if str(history['date'][i]) in news_by_date: history.loc[i, 'news_pos_num'] = str( np.round( float(news_by_date[str(history['date'][i])][0]), 2)) history.loc[i, 'news_neg_num'] = str( np.round( float(news_by_date[str(history['date'][i])][1]), 2)) else: history.loc[i, 'news_pos_num'] = "0" history.loc[i, 'news_neg_num'] = "0" #将经过标准化的数据处理成训练集和测试集可接受的形式 def func_train_data(data_stock): if cls.groupby_skip == False: cls.groupby_skip = True return None print("正在处理的股票代码:%06s" % data_stock.name) data_temp_x = data_stock[[ "opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount", "news_pos_num", "news_neg_num" ]] data_temp_y = data_stock[["rate", "date", "stock_id"]] data_res = [] for i in range(day_num - 1, len(data_temp_x.index) - deviation): data_res.append([ data_temp_x.iloc[i - day_num + 1:i + 1].values.reshape( day_num, 10).tolist() ] + data_temp_y.iloc[i + deviation].values.reshape( 1, 3).tolist()) if len(data_res) != 0: pd.DataFrame(data_res).to_csv(output_path, index=False, header=False, mode="a") g_stock_num = history.groupby(by=["stock_id"]) cls.groupby_skip = False g_stock_num.apply(func_train_data) slimit += skip
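# The while-loop above repeatedly multiplies wp_up and wp_down by 10 to keep the
# naive-Bayes products from underflowing. A hedged alternative sketch (not the code used
# here) is to accumulate log-probabilities and only exponentiate the difference at the
# end; word_trend is assumed to map word -> [P(word|up), P(word|down)] as loaded above.
import math


def log_naive_bayes(words, word_trend, p_up, p_down):
    lp_up, lp_down = math.log(p_up), math.log(p_down)
    floor_up = 1 / word_trend['total_words'][0]    # same smoothing floor as above
    floor_down = 1 / word_trend['total_words'][1]
    for w in words:
        up, down = word_trend.get(w, [0, 0])
        lp_up += math.log(up if up > 0 else floor_up)
        lp_down += math.log(down if down > 0 else floor_down)
    m = max(lp_up, lp_down)
    eu, ed = math.exp(lp_up - m), math.exp(lp_down - m)
    return eu / (eu + ed), -ed / (eu + ed)   # same sign convention as news_by_id above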
def makeTextOriginCsv(cls, cur=None, start_date=None, end_date=None, day_num=1, basic_path=None, input_file=None, output_file=None, stock_id=None, rewrite=True): #初始化源文件路径和存储文件路径 if cur is None or start_date is None or end_date is None or input_file is None or output_file is None or stock_id is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) input_path = os.path.join(basic_path, input_file) output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) ''' ''' cur.execute( "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) count = cur.fetchall() count = count[0][0] if rewrite == True: pd.DataFrame({"0": [], "1": []}).to_csv(output_path, index=False) deviation = 2 skip = 50 slimit = 0 while slimit < count: cur.execute( "SELECT stock_id,closing,date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d " % (stock_id, start_date, end_date, 0 if slimit - deviation - day_num < 0 else slimit - deviation - day_num, skip if slimit - deviation - day_num < 0 else skip + deviation + day_num)) history_t = cur.fetchall() sdate = str(history_t[0][2]) edate = str(history_t[-1][2]) history_temp = [] for h in zip(*history_t): history_temp.append(h) history = { 'stock_id': history_temp[0], 'closing': history_temp[1], 'date': history_temp[2] } del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by=['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) ''' ''' cur.execute( "SELECT GROUP_CONCAT(id SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' GROUP BY time order by time " % (sdate, edate)) news_temp = cur.fetchall() news_date = {} for k in history['date'].keys(): if (k - deviation - day_num + 1) in history['date']: news_date[str(history['date'][k])] = [ str(history['date'][k - deviation - day_num + 1]), str(history['date'][k - deviation]) ] news_by_date = {} news_by_id = {} for n in news_temp: news_by_date[str(n[1])] = n[0].split(",") for nid in news_by_date[str(n[1])]: news_by_id[nid] = '' del news_temp nid_len = len(news_by_id) reader = pd.read_csv(input_path, chunksize=1000) for sentences in reader: for k in sentences['1'].keys(): nid = str(sentences['0'][k]) if nid in news_by_id and news_by_id[nid] == '': news_by_id[nid] = str(sentences['1'][k]).split(" ") nid_len -= 1 if nid_len <= 0: break reader.close() del reader, sentences def func_train_data(date_stock): if cls.groupby_skip == False: cls.groupby_skip = True return None date = str(date_stock.name) if date not in news_date: return sdate = datetime.datetime.strptime(news_date[date][0], '%Y-%m-%d') edate = datetime.datetime.strptime(news_date[date][1], '%Y-%m-%d') words = [] while sdate <= edate: cur_date = sdate.strftime('%Y-%m-%d') sdate += datetime.timedelta(days=1) if cur_date not in news_by_date: print("%s error" % cur_date) return None for i in news_by_date[cur_date]: words += news_by_id[i] data = [] for k in date_stock['stock_id'].keys(): data.append([[" ".join(words)], [ str(np.round(float(history['rate'][k]), 2)), str(date_stock['date'][k]), str(date_stock['stock_id'][k]) ]]) print("正在处理的日期:%s" % date_stock.name) pd.DataFrame(data).to_csv(output_path, index=False, header=False, mode="a", encoding="utf-8") g_stock = history.groupby(by=["date"]) 
cls.groupby_skip = False
g_stock.apply(func_train_data)
slimit += skip
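# The cls.groupby_skip flag used in func_train_data appears to work around pandas
# calling the function passed to groupby().apply() twice on the first group (once to
# infer the output shape). A self-contained sketch of the same guard pattern; the class
# and column names are illustrative only.
import pandas as pd


class SkipFirstApply:
    skipped = False

    @classmethod
    def handle(cls, group):
        if not cls.skipped:      # swallow the duplicate first call
            cls.skipped = True
            return None
        print("processing group:", group.name)

# usage (illustrative):
# SkipFirstApply.skipped = False
# df.groupby("stock_id").apply(SkipFirstApply.handle)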
def makeBindexOriginCsv(cls, cur=None, words=None, start_date=None, end_date=None, day_num=1, basic_path=None, output_file=None, stock_id=None): #初始化源文件路径和存储文件路径 if cur is None or words is None or start_date is None or end_date is None or stock_id is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) if output_file is None: output_file = "bindex_data.csv" output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) #清空接收路径下的文件,初始化列名 pd.DataFrame({ "0": [], "1": [] }).to_csv(output_path, index=False, encoding="utf-8") start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d') end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d') for i in range(len(words)): words[i] = "'" + words[i] + "'" words_str = ",".join(words) cur.execute( "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) count = cur.fetchall() count = count[0][0] deviation = 2 skip = 100 slimit = 0 while slimit < count: cur.execute( "SELECT stock_id,closing,date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date, stock_id asc limit %d,%d " % (stock_id, start_date, end_date, 0 if slimit - deviation - day_num < 0 else slimit - deviation - day_num, skip if slimit - deviation - day_num < 0 else skip + deviation + day_num)) history_t = cur.fetchall() sdate = str(history_t[0][2]) edate = str(history_t[-1][2]) history_temp = [] for h in zip(*history_t): history_temp.append(h) history = { 'stock_id': history_temp[0], 'closing': history_temp[1], 'date': history_temp[2] } del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by=['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) ''' ''' cur.execute( "SELECT b.vocab_id, b.bindex, b.date FROM vocab v left join baidu_index b on v.id = b.vocab_id WHERE v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc" % (words_str, sdate, edate)) bindex = cur.fetchall() news_date = {} for k in history['date'].keys(): if (k - deviation - day_num + 1) in history['date']: news_date[str(history['date'][k])] = [ str(history['date'][k - deviation - day_num + 1]), str(history['date'][k - deviation]) ] bindex_t = [] bindex_vec = 0 cur_date = None if len(bindex) > 0: cur_date = str(bindex[0][2]) bix = [] for bi in bindex: if str(bi[2]) != cur_date: bindex_t.append([bix, cur_date]) cur_date = str(bi[2]) bix = [] bix_temp = json.loads(bi[1]) bix_temp = sorted(bix_temp.items(), key=lambda v: v[0]) for k, b in bix_temp: bix_list = sorted(b.items(), key=lambda v: v[0]) for kk, bb in bix_list: bix.append(bb) if bindex_vec == 0: bindex_vec = len(bix) bindex_t.append([bix, cur_date]) del bindex bindex_by_date = {} for k in range(1, len(bindex_t)): b_t = [] for kk in range(len(bindex_t[k][0])): if int(bindex_t[k][0][kk]) != 0 and int( bindex_t[k - 1][0][kk]) != 0: b_t.append( str( np.round( float(100 * (int(bindex_t[k][0][kk]) / int(bindex_t[k - 1][0][kk]) - 1)), 2))) else: b_t.append(str(0.00)) bindex_by_date[bindex_t[k][1]] = b_t del bindex_t def func_train_data(date_stock): if cls.groupby_skip == False: cls.groupby_skip = True return None date = str(date_stock.name) if date not in news_date: return sdate = datetime.datetime.strptime(news_date[date][0], '%Y-%m-%d') edate = datetime.datetime.strptime(news_date[date][1], '%Y-%m-%d') bindexs 
= [] while sdate <= edate: cur_date = sdate.strftime('%Y-%m-%d') sdate += datetime.timedelta(days=1) if cur_date not in bindex_by_date: print("%s error" % cur_date) exit() else: bindexs += bindex_by_date[cur_date] data = [] for k in date_stock['stock_id'].keys(): data.append([(np.array(bindexs).reshape( int(len(bindexs) / bindex_vec), bindex_vec)).tolist(), [ str(np.round(float(history['rate'][k]), 2)), str(date_stock['date'][k]), str(date_stock['stock_id'][k]) ]]) print("正在处理的日期:%s" % date_stock.name) pd.DataFrame(data).to_csv(output_path, index=False, header=False, mode="a", encoding="utf-8") g_stock = history.groupby(by=["date"]) cls.groupby_skip = False g_stock.apply(func_train_data) slimit += skip
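# The loop above converts consecutive Baidu-index vectors into day-over-day percentage
# changes and treats a zero on either day as "no change". A compact sketch of the same
# rule using numpy; the two input lists are assumed to have equal length.
import numpy as np


def bindex_pct_change(prev_day, cur_day):
    prev_day = np.asarray(prev_day, dtype=float)
    cur_day = np.asarray(cur_day, dtype=float)
    out = np.zeros_like(cur_day)
    mask = (prev_day != 0) & (cur_day != 0)
    out[mask] = np.round(100 * (cur_day[mask] / prev_day[mask] - 1), 2)
    return [str(v) for v in out]   # the caller above stores the values as strings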
def makeNewsKeywordCacheCsv(cls, cur=None, start_date=None, end_date=None, basic_path=None, analyse_type='tfidf', rewrite=True): #初始化源文件路径和存储文件路径 if cur is None or start_date is None or end_date is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) if analyse_type not in ['tfidf', 'textrank', 'all', 'title']: return None tfidf = analyse.extract_tags textrank = analyse.textrank origin_data_path = os.path.join(basic_path, "%s_keyword_cache.csv" % analyse_type) VTool.makeDirs(files=[origin_data_path]) #清空接收路径下的文件,初始化列名 if rewrite == True: pd.DataFrame({ "0": [], "1": [] }).to_csv(origin_data_path, index=False, encoding="utf-8") skip = 30 start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d') start_date -= datetime.timedelta(days=1) end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d') i = 1 while start_date <= end_date: start_date += datetime.timedelta(days=1) cur_date = start_date.strftime('%Y-%m-%d') start_date += datetime.timedelta(days=skip) if start_date > end_date: cur_end_date = end_date.strftime('%Y-%m-%d') else: cur_end_date = start_date.strftime('%Y-%m-%d') if analyse_type == 'title': cur.execute( "SELECT id, title FROM news WHERE time between '%s' and '%s' order by time, title" % (cur_date, cur_end_date)) else: cur.execute( "SELECT id, content FROM news WHERE time between '%s' and '%s' order by time, content" % (cur_date, cur_end_date)) news = cur.fetchall() news_keyword = [] for n in news: i += 1 print(i) if analyse_type == 'tfidf': kword = tfidf( n[1], allowPOS=['n', 'nr', 'ns', 'nt', 'nz', 'vn', 'v']) kword = " ".join(kword) elif analyse_type == 'textrank': kword = textrank( n[1], allowPOS=['n', 'nr', 'ns', 'nt', 'nz', 'vn', 'v']) kword = " ".join(kword) elif analyse_type == 'all': kword = cls.jiebafenci(n[1]) elif analyse_type == 'title': kword = cls.jiebafenci(n[1]) else: kword = '' keywords = [str(n[0]), kword.strip()] news_keyword.append(keywords) pd.DataFrame(news_keyword).to_csv(origin_data_path, index=False, header=False, mode="a", encoding="utf-8")
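# Minimal illustration of the two jieba extractors selected by analyse_type above; the
# sample sentence is made up and topK is left at jieba's default.
from jieba import analyse

sample_text = "公司发布年度财报,营业收入同比大幅增长"  # illustrative sentence only
pos_filter = ['n', 'nr', 'ns', 'nt', 'nz', 'vn', 'v']
print(" ".join(analyse.extract_tags(sample_text, allowPOS=pos_filter)))   # tfidf ranking
print(" ".join(analyse.textrank(sample_text, allowPOS=pos_filter)))       # textrank ranking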
def stock_lstm_softmax(self, basic_path=None, train_file=None, model_file=None, log_folder=None, pre_file=None): """ 使用LSTM处理股票数据 分类预测 """ if train_file is None or model_file is None or log_folder is None or pre_file is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) train_path = os.path.join(basic_path, train_file) model_path = os.path.join(basic_path, model_file) log_path = os.path.join(basic_path, log_folder) pre_path = os.path.join(basic_path, pre_file) VTool.makeDirs(files=[model_path, pre_path], folders=[log_path]) tf.reset_default_graph() #给batch_size赋值 self.batch_size = 20 test_part = 0.1 self.train_size, self.test_size = VTool.initCsvTrainAndTest(basic_path=basic_path, input_file=train_file, batch_size=self.batch_size, test_part=test_part) #学习率 learning_rate = 0.001 #喂数据给LSTM的原始数据有几行,即:一次希望LSTM能“看到”多少个交易日的数据 origin_data_row = 3 #喂给LSTM的原始数据有几列,即:日线数据有几个元素 origin_data_col = 8+20 #LSTM网络有几层 layer_num = 1 #LSTM网络,每层有几个神经元 cell_num = 256 #最后输出的数据维度,即:要预测几个数据,该处需要处理分类问题,按照自己设定的类型数量设定 output_num = 3 #每次给LSTM网络喂多少行经过处理的股票数据。该参数依据自己显卡和网络大小动态调整,越大 一次处理的就越多,越能占用更多的计算资源 batch_size = tf.placeholder(tf.int32, []) #输入层、输出层权重、偏置。 #通过这两对参数,LSTM层能够匹配输入和输出的数据 W = { 'in':tf.Variable(tf.truncated_normal([origin_data_col, cell_num], stddev = 1), dtype = tf.float32), 'out':tf.Variable(tf.truncated_normal([cell_num, output_num], stddev = 1), dtype = tf.float32) } bias = { 'in':tf.Variable(tf.constant(0.1, shape=[cell_num,]), dtype = tf.float32), 'out':tf.Variable(tf.constant(0.1, shape=[output_num,]), dtype = tf.float32) } #告诉LSTM网络,即将要喂的数据是几行几列 #None的意思就是喂数据时,行数不确定交给tf自动匹配 #我们喂得数据行数其实就是batch_size,但是因为None这个位置tf只接受数字变量,而batch_size是placeholder定义的Tensor变量,表示我们在喂数据的时候才会告诉tf具体的值是多少 input_x = tf.placeholder(tf.float32, [None, origin_data_col * origin_data_row]) input_y = tf.placeholder(tf.float32, [None, output_num]) #处理过拟合问题。该值在其起作用的层上,给该层每一个神经元添加一个“开关”,“开关”打开的概率是keep_prob定义的值,一旦开关被关了,这个神经元的输出将被“阻断”。这样做可以平衡各个神经元起作用的重要性,杜绝某一个神经元“一家独大”,各种大佬都证明这种方法可以有效减弱过拟合的风险。 keep_prob = tf.placeholder(tf.float32, []) #通过reshape将输入的input_x转化成2维,-1表示函数自己判断该是多少行,列必须是origin_data_col #转化成2维 是因为即将要做矩阵乘法,矩阵一般都是2维的(反正我没见过3维的) input_x_after_reshape_2 = tf.reshape(input_x, [-1, origin_data_col]) #当前计算的这一行,就是输入层。输入层的激活函数是relu,并且施加一个“开关”,其打开的概率为keep_prob #input_rnn即是输入层的输出,也是下一层--LSTM层的输入 input_rnn = tf.nn.dropout(tf.nn.relu_layer(input_x_after_reshape_2, W['in'], bias['in']), keep_prob) #通过reshape将输入的input_rnn转化成3维 #转化成3维,是因为即将要进入LSTM层,接收3个维度的数据。粗糙点说,即LSTM接受:batch_size个,origin_data_row行cell_num列的矩阵,这里写-1的原因与input_x写None一致 input_rnn = tf.reshape(input_rnn, [-1, origin_data_row, cell_num]) #定义一个带着“开关”的LSTM单层,一般管它叫细胞 def lstm_cell(): cell = rnn.LSTMCell(cell_num, reuse=tf.get_variable_scope().reuse) return rnn.DropoutWrapper(cell, output_keep_prob=keep_prob) #这一行就是tensorflow定义多层LSTM网络的代码 lstm_layers = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(layer_num)], state_is_tuple = True) #初始化LSTM网络 init_state = lstm_layers.zero_state(batch_size, dtype = tf.float32) #使用dynamic_rnn函数,告知tf构建多层LSTM网络,并定义该层的输出 outputs, state = tf.nn.dynamic_rnn(lstm_layers, inputs = input_rnn, initial_state = init_state, time_major = False) h_state = state[-1][1] #该行代码表示了输出层 #将LSTM层的输出,输入到输出层(输出层带softmax激活函数),输出为各个分类的概率 #假设有3个分类,那么输出举例为:[0.001, 0.992, 0.007],表示第1种分类概率千分之1,第二种99.2%, 第三种千分之7 y_pre = tf.nn.softmax(tf.matmul(h_state, W['out']) + bias['out']) #损失函数,用作指导tf #loss定义为交叉熵损失函数,softmax输出层大多都使用的这个损失函数。关于该损失函数详情可以百度下 loss = -tf.reduce_mean(input_y * tf.log(y_pre)) #告诉tf,它需要做的事情就是就是尽可能将loss减小 
# learning_rate controls the step size of that minimization. If the goal is compared to
# "walking south to the market", a larger learning_rate means a bigger stride per step;
# bigger strides are not always better, because a noisy batch can push a big step in the
# wrong direction, or overshoot the target right before reaching it. There is no
# universally good value, so it has to be tuned for the data at hand; 0.001 is the
# TensorFlow default.
# (The analogy above is the author's informal explanation; corrections are welcome.)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Accuracy metric used to monitor training:
# correct predictions / total predictions, e.g. 0.55 means 55% correct
correct_prediction = tf.equal(tf.argmax(y_pre, 1), tf.argmax(input_y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# Saver, so a later run can restore the previous result instead of starting from scratch
saver = tf.train.Saver(tf.global_variables())
# Variable initializer, used on the first run (i.e. when no saved model exists yet)
init = tf.global_variables_initializer()

# Let TensorFlow grow GPU memory on demand instead of grabbing everything up front
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# Use "with" so the session and the prediction file are closed properly when done
with sess, open(pre_path, "w") as f:
    try:
        # Restore the model stored at model_path
        saver.restore(sess, model_path)
        print("Model parameters loaded successfully")
    except:
        # First run, or the checkpoint was deleted: initialize all variables instead
        print("No model parameters loaded; file missing or first run")
        sess.run(init)

    i = 0
    while True:
        train_x, train_y = self.get_train_softmax(file_path=train_path, time_step=origin_data_row, rtype="train")
        if train_x is None:
            print("The whole training set has been consumed")
            saver.save(sess, model_path)
            print("Model saved\n")
            break

        if (i + 1) % 10 == 0:
            train_accuracy = sess.run(accuracy, feed_dict={
                input_x: train_x, input_y: train_y, keep_prob: 1.0, batch_size: self.batch_size})
            print("step: {0}, training_accuracy: {1}".format(i + 1, train_accuracy))
            saver.save(sess, model_path)
            print("Model saved\n")

            # Each time the model is saved, also dump the predicted and real labels plus the
            # loss to the prediction file; TensorBoard could replace this bookkeeping.
            _y_pre_train = sess.run(y_pre, feed_dict={
                input_x: train_x, input_y: train_y, keep_prob: 1.0, batch_size: self.batch_size})
            _loss = sess.run(loss, feed_dict={
                input_x: train_x, input_y: train_y, keep_prob: 1.0, batch_size: self.batch_size})
            a1 = np.array(train_y).reshape(self.batch_size, output_num)
            b1 = np.array(_y_pre_train).reshape(self.batch_size, output_num)
            f.write(str(a1.tolist()))
            f.write("\n")
            f.write(str(b1.tolist()))
            f.write("\n")
            f.write(str(_loss))
            f.write("\n")

        i += 1
        # Train the LSTM network for one step with the given batch
        sess.run(train_op, feed_dict={input_x: train_x, input_y: train_y, keep_prob: 0.6, batch_size: self.batch_size})

    # Evaluate accuracy on the test split
    test_x, test_y = self.get_train_softmax(file_path=train_path, time_step=origin_data_row, rtype="test")
    print("test accuracy {0}".format(sess.run(accuracy, feed_dict={
        input_x: test_x, input_y: test_y, keep_prob: 1.0, batch_size: self.test_size})))

self.init()
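# get_train_softmax() is not shown here, but the graph above expects input_y rows to be
# one-hot vectors of length output_num (3 classes). A hedged sketch of mapping a signed
# return rate to such a label; the +-0.5% band is an illustrative threshold, not the
# project's actual rule.
import numpy as np


def rate_to_onehot(rate, flat_band=0.5, num_classes=3):
    if rate > flat_band:
        idx = 0        # up
    elif rate < -flat_band:
        idx = 1        # down
    else:
        idx = 2        # roughly flat
    onehot = np.zeros(num_classes, dtype=np.float32)
    onehot[idx] = 1.0
    return onehot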
def make_train_csv(cls, cur=None, start_date=None, end_date=None, basic_path=None, output_file=None, time_step=3, word_count=20, stock_id_str=None, ranking_type='tfidf'): if cur == None or start_date == None or end_date == None or output_file == None or stock_id_str == None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) if time_step < 0: time_step = 3 if word_count < 0: word_count = 20 if ranking_type not in ["tfidf", "textrank"]: ranking_type = "tfidf" output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) pd.DataFrame({"0":[], "1":[]}).to_csv(output_path, index=False) words = cls.getImportVocab(cur, count=20, ranking_type=ranking_type) word_count = len(words) for i in range(len(words)): words[i] = "'" + words[i] + "'" words_str = ",".join(words) del words cur.execute("SELECT count(*) as count FROM history WHERE stock_id in (%s) and date between '%s' and '%s' " % (stock_id_str, start_date, end_date)) count = cur.fetchall() count = count[0][0] stock_id_num = len(stock_id_str.split(",")) skip = 50 * stock_id_num slimit = 0 while slimit < count: cur.execute("SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id in (%s) and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d " % (stock_id_str, start_date, end_date, 0 if slimit-stock_id_num < 0 else slimit-stock_id_num, skip if slimit-stock_id_num < 0 else skip+stock_id_num)) slimit += skip history_tt = cur.fetchall() history_t = [] for h in history_tt: history_t.append([int(h[0]), float(h[1]), float(h[2]), float(h[3]), float(h[4]), float(h[5]), float(h[6]), float(h[7]), float(h[8]), str(h[9])]) del history_tt sdate = str(history_t[0][9]) edate = str(history_t[-1][9]) sdate = datetime.datetime.strptime(sdate,'%Y-%m-%d') sdate = (sdate - datetime.timedelta( days=(time_step+1) )).strftime('%Y-%m-%d') cur.execute("SELECT b.vocab_id, b.bindex, b.date FROM vocab v left join baidu_index b on v.id = b.vocab_id WHERE v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc" % (words_str, sdate, edate)) bindex = cur.fetchall() bindex_t = [] bindex_vec = 0 cur_date = None if len(bindex) > 0: cur_date = str(bindex[0][2]) bix = [] bix_item = [cur_date] if len(bindex) > 0: for bi in bindex: if str(bi[2]) != cur_date: cur_date = str(bi[2]) bix.append(bix_item) bix_item = [cur_date] bix_temp = json.loads(bi[1]) bix_item.append(bix_temp['all']['0']) bix.append(bix_item) del bindex bindex = {} for k in range(1,len(bix)): b_t = [] for kk in range(1,len(bix[k])): if int(bix[k][kk]) != 0 and int(bix[k-1][kk]) != 0: b_t.append(str(np.round(float(100 * (int(bix[k][kk]) / int(bix[k-1][kk]) - 1)), 2))) else: b_t.append(str(0.00)) bindex[bix[k][0]] = b_t del bix for i in range(len(history_t)): history_t[i] += bindex[history_t[i][9]] history_temp = [] for h in zip(*history_t): history_temp.append(h) history = {'stock_id':history_temp[0], 'opening':history_temp[1], 'closing':history_temp[2], 'difference':history_temp[3], 'percentage_difference':history_temp[4], 'lowest':history_temp[5], 'highest':history_temp[6], 'volume':history_temp[7], 'amount':history_temp[8], 'date':history_temp[9]} for i in range(10, 10+word_count): history["word%s" % (i-9)] = history_temp[i] del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by = ['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 
1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) for i in history.index: history.loc[i, 'rate'] = str(np.round(float(history['rate'][i]), 2)) #将经过标准化的数据处理成训练集和测试集可接受的形式 def func_train_data(data_stock, time_step): if cls.groupby_skip == False: cls.groupby_skip = True return None print ("正在处理的股票代码:%06s"%data_stock.name) word_key_list = [] for i in range(1,word_count+1): word_key_list.append("word%s" % i) x = word_key_list + ["opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount"] #提取输入S列(对应train_x) data_temp_x = data_stock[x] #提取输出列(对应train_y) data_temp_y = data_stock[["rate", "date", "stock_id"]] data_res = [] for i in range(time_step - 1, len(data_temp_x.index) - 1): data_res.append( data_temp_x.iloc[i - time_step + 1: i + 1].values.reshape(1, time_step * (8+word_count)).tolist() + data_temp_y.iloc[i + 1].values.reshape(1,3).tolist() ) if len(data_res) != 0: pd.DataFrame(data_res).to_csv(output_path, index=False, header=False, mode="a") g_stock = history.groupby(by = ["stock_id"]) #清空接收路径下的文件,初始化列名 cls.groupby_skip = False g_stock.apply(func_train_data, time_step = time_step)
res_file = os.path.join(basic_path, choose_stock_folder, model_folder, "ten-fold-ten-times.csv")

# Load the progress recorded so far
columns = []
for i in range(len(test_part_array) - 1):
    columns.append(str(i))

csv_res = {}
if is_reload or not os.path.exists(res_file):
    for i in range(len(test_part_array) - 1):
        csv_res[str(i)] = []
        for j in range(test_part_times):
            csv_res[str(i)].append([])
    VTool.makeDirs(files=[res_file])
    pd.DataFrame(csv_res).to_csv(res_file, index=False, columns=columns)

csv_res = pd.read_csv(res_file)
array_start = len(test_part_array) - 1
times_start = test_part_times
res = {}
find = False
for i in csv_res:
    res[i] = csv_res[i].apply(eval).values
    if not find:
        for j in range(len(res[i])):
            if len(res[i][j]) == 0:
                array_start = int(i)
def train_rate(self, basic_path=None, data_file=None, model_folder=None, folder_extra="", input_type="origin", word_count=0, input_size=8, batch_size=30, time_step=10, reduce_num=0, test_part_start=0.9, test_part_end=1, times=50): if data_file is None or model_folder is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) data_path = os.path.join(basic_path, data_file) model_path = os.path.join(basic_path, model_folder) VTool.makeDirs(folders=[model_path]) f = open(data_path) df = pd.read_csv(f) f.close() columns = self.getColumns(input_type=input_type, word_count=word_count, res_type="rate") data = df[columns].values rnn_unit = 8 #单元数量 output_size = 2 tf.reset_default_graph() X = tf.placeholder(tf.float32, shape=[None, time_step, input_size + word_count]) Y = tf.placeholder(tf.float32, shape=[None, time_step, output_size]) keep_prob = tf.placeholder(tf.float32) x_train, y_train = self.get_train_data( data, model_path, input_size + word_count, batch_size, time_step, reduce_num, test_part_start, test_part_end, { "input_type": input_type, "word_count": word_count, "input_size": input_size, "time_step": time_step, "rnn_unit": rnn_unit, "output_size": output_size }) pred, predictions, _ = self.lstm(X=X, keep_prob=keep_prob, rnn_unit=rnn_unit, input_size=input_size + word_count, output_size=output_size) global_step = tf.Variable(0) lr = 0.01 #learning_rate = tf.train.exponential_decay(0.01, global_step, decay_steps=len(x_train), decay_rate=0.95, staircase=True) #损失函数 loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=tf.reshape( Y, [-1, 2]))) train_op = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step) #train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step) saver = tf.train.Saver() y_input = tf.argmax(tf.reshape(Y, [-1, 2]), 1) correct_predictions = tf.equal(predictions, y_input) accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float")) checkpoint_dir = os.path.abspath( os.path.join(model_path, "checkpoints" + folder_extra)) checkpoint_prefix = os.path.join(checkpoint_dir, "model") with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for i in range(times): j = 0 while j < len(x_train): j_end = j + batch_size if j + batch_size < len( x_train) else len(x_train) _, _loss = sess.run([train_op, loss], feed_dict={ X: x_train[j:j_end], Y: y_train[j:j_end], keep_prob: 0.8 }) j = j_end print("Number of iterations:", i, " loss:", _loss) if i % 10 == 0: print("保存模型:", saver.save(sess, checkpoint_prefix)) # _predictions = sess.run([predictions],feed_dict={X:x_test, keep_prob:1}) # _predictions = np.array(_predictions).reshape((-1, time_step)).tolist() # y_predict = [] # for p in _predictions: # y_predict.append(p[-1]) # all_num, right_num, all_accuracy = self.various_accuracy(output_size, y_test, y_predict) # print("All input_nums: {:g}, right_nums: {:g}, accuracy: {:g}".format(all_num, right_num, right_num/all_num)) # for a in all_accuracy: # print("input_nums: {:g}, pre_nums: {:g}, right_nums: {:g}, accuracy: {:g}".format(a[0], a[1], a[2], a[3])) print("保存模型:", saver.save(sess, checkpoint_prefix)) print("The train has finished")
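# The commented-out learning_rate line above hints at exponential decay. A hedged sketch
# of wiring tf.train.exponential_decay into the same AdamOptimizer; decay_steps and
# decay_rate are illustrative values.
import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
decayed_lr = tf.train.exponential_decay(0.01, global_step,
                                        decay_steps=1000, decay_rate=0.95,
                                        staircase=True)
# train_op = tf.train.AdamOptimizer(decayed_lr).minimize(loss, global_step=global_step)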
def make_train_csv(cls, cur=None, start_date=None, end_date=None, basic_path=None, output_file=None, time_step=10, stock_id_str=None): """ 制作股票分类数据 orgin_data_path:原始数据存放路径 all_data_path:制作成可被算法接收的文件存放路径 """ #初始化源文件路径和存储文件路径 if cur is None or start_date is None or end_date is None or output_file is None or stock_id_str is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) if time_step < 0: time_step = 10 output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) data = cur.execute( "select id, stock_id, date, opening, closing, difference, percentage_difference, lowest, highest, volume, amount from history where stock_id in (%s) and date between '%s' and '%s' " % (stock_id_str, start_date, end_date)) data = cur.fetchall() if len(data) == 0: return None res = [] for d in data: res.append([ int(d[0]), int(d[1]), str(d[2]), float(d[3]), float(d[4]), float(d[5]), float(d[6]), float(d[7]), float(d[8]), float(d[9]), float(d[10]) ]) new_data = [] for d in zip(*res): new_data.append(d) origin_data = { 'id': new_data[0], 'stock_id': new_data[1], 'date': new_data[2], 'opening': new_data[3], 'closing': new_data[4], 'difference': new_data[5], 'percentage_difference': new_data[6], 'lowest': new_data[7], 'highest': new_data[8], 'volume': new_data[9], 'amount': new_data[10] } #读取原始数据,只保留需要使用的列 total_data = DataFrame(origin_data) #根据股票代码排序,相同的股票代码按照交易日期排序。 #inplace参数表示不需要返回排序后的结果,直接覆盖原变量即可 #error-按字符顺序排序,其中日期没有规格化,故导致排序错误 total_data.sort_values(by=['stock_id', 'date'], inplace=True) #根据股票代码分组 g_stock_num = total_data.groupby(by=["stock_id"]) #针对每一组股票,分别计算收益gate,其定义为:(下下一个交易日的开盘价 / 下一个交易日的开盘价) - 1 #对gate乘以100,使之变成百分比形式(0.09 -> 9,表示9%) #使用np.round函数保存两位小数,之后的数字丢弃(9.8346474 - > 9.83) total_data["gate"] = 100 * (g_stock_num.shift(0)["closing"] / g_stock_num.shift(1)["closing"] - 1) for i in total_data.index: total_data.loc[i, 'gate'] = str( np.round(float(total_data['gate'][i]), 2)) #重新调整列的顺序,为接下来处理成输入、输出形式做准备 total_data = total_data[[ "opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount", "gate", "date", "stock_id" ]] #将调整列顺序后的代码,重新按照股票代码分组 g_stock_num = total_data.groupby(by=["stock_id"]) #拿time_step个交易日的数据(默认为60个交易日),进行标准化 def func_stand(data_one_stock_num, time_step): #通过apply进入函数内的数据,其股票名为data_one_stock_num.name,类型为pd.dataFrame #即,进入此函数的数据为所有名为data_one_stock_num.name的集合 #dataFrame.shape:(num , 11), num是这个股票出现的次数 for colu_name in data_one_stock_num.columns: if colu_name in ["gate", "date", "stock_id"]: continue #只针对输入数据进行标准化,标准化算法为: (原始数据 - 平均值) / 标准差 #这里每一次for循环,都拿出了1列数据,针对这一列进行标准化并覆盖原数据 data_one_stock_num[colu_name] = data_one_stock_num[colu_name] #data_one_stock_num[colu_name] = ((data_one_stock_num[colu_name] - data_one_stock_num[colu_name].rolling(time_step).mean())/data_one_stock_num[colu_name].rolling(time_step).std()) return data_one_stock_num #将经过标准化的数据处理成训练集和测试集可接受的形式 def func_train_data(data_one_stock_num, time_step): if cls.groupby_skip == False: cls.groupby_skip = True return None print("正在处理的股票代码:%06s" % data_one_stock_num.name) #提取输入列(对应train_x) data_temp_x = data_one_stock_num[[ "opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount" ]] #提取输出列(对应train_y) data_temp_y = data_one_stock_num[["gate", "date", "stock_id"]] data_res = [] #for循环从time_step - 1开始,因为前time_step - 1个数据不满足time_step条件 #例如:time_step为60,即需要60个交易日的数据制成训练集的一个输入,但某只股票因为停牌等原因,只有50个交易日的数据。那么它就可以跳过了,不满足最低数目的要求 for i in range(time_step - 1, len(data_temp_x.index) - 
1): data_res.append( data_temp_x.iloc[i - time_step + 1:i + 1].values.reshape(1, time_step * 8).tolist() + data_temp_y.iloc[i + 1].values.reshape(1, 3).tolist()) if len(data_res) != 0: #使用末尾添加的形式,将各个股票的数据依次添加进设定的路径中。 #index参数表示是否需添加一列序号,header表示是否需要添加列头,mode表示选择哪一种模型进行csv操作(类似于open函数的模型) pd.DataFrame(data_res).to_csv(output_path, index=False, header=False, mode="a") return data_one_stock_num #数据标准化 data_after_stand = g_stock_num.apply(func_stand, time_step=time_step) data_after_stand.dropna(inplace=True) #将数据转成训练集合的形式 g_stock_num = data_after_stand.groupby(by=["stock_id"]) #清空接收路径下的文件,初始化列名 pd.DataFrame({"0": [], "1": []}).to_csv(output_path, index=False) cls.groupby_skip = False g_stock_num.apply(func_train_data, time_step=time_step)
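# func_stand above currently passes every input column through unchanged; the
# commented-out line shows the intended rolling standardization. A standalone sketch of
# that transform (column name and window size in the usage comment are illustrative):
import pandas as pd


def rolling_standardize(series, time_step):
    return (series - series.rolling(time_step).mean()) / series.rolling(time_step).std()

# usage (illustrative): df["opening"] = rolling_standardize(df["opening"], time_step=10)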
def makeEnsembleData(self, stock_folders=None): test_part_array = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] test_part_times = 10 is_reload = False folder_600104 = "600104/" #3 folder_601318 = "601318/" #5 folder_002230 = "002230/" #7 basic_path = self.basic_path stock_folders = stock_folders if stock_folders != None else [folder_600104] for choose_stock_folder in stock_folders: for i in range(len(test_part_array)-1): for j in range(test_part_times): folder_extra = '_' + str(i) + '_' + str(j) data_file = os.path.join(basic_path, choose_stock_folder, "ensemble/checkpoints" + folder_extra, "ensemble_data.csv") if os.path.exists(data_file) and not is_reload: continue else: VTool.makeDirs(files=[data_file]) data = [] # cnn-model-1 1 cst1 = CNNStockText() accuracy, profit, origin_profit, predictions, others = cst1.predict(basic_path=basic_path, input_file=choose_stock_folder+"cnn/title_data.csv", output_folder=choose_stock_folder+"cnn/title_run", word2vec_model="news_title_word2vec", filter_sizes=[3, 4, 5], folder_extra=folder_extra, reduce_num=21, test_part_start=test_part_array[0], test_part_end=test_part_array[-1]) data.append([accuracy, profit, origin_profit, predictions, others]) cst2 = CNNStockText() # cnn-model-2 2 accuracy, profit, origin_profit, predictions, others = cst2.predict(basic_path=basic_path, input_file=choose_stock_folder+"cnn/tfidf_text_data.csv", output_folder=choose_stock_folder+"cnn/text_run", word2vec_model="news_tfidf_word2vec", filter_sizes=[8, 9, 10], folder_extra=folder_extra, reduce_num=21, test_part_start=test_part_array[0], test_part_end=test_part_array[-1]) data.append([accuracy, profit, origin_profit, predictions, others]) # cnn-model-3 3 csn = CNNStockNumber() accuracy, profit, origin_profit, predictions, others = csn.predict(basic_path=basic_path, input_file=choose_stock_folder+"cnn/bindex_data.csv", output_folder=choose_stock_folder+"cnn/bindex_run", embedding_dim=3, filter_sizes=[2], folder_extra=folder_extra, reduce_num=21, test_part_start=test_part_array[0], test_part_end=test_part_array[-1]) data.append([accuracy, profit, origin_profit, predictions, others]) # cnn-model-4 4 accuracy, profit, origin_profit, predictions, others = csn.predict(basic_path=basic_path, input_file=choose_stock_folder+"cnn/news_stock_data_%s.csv" % i, output_folder=choose_stock_folder+"cnn/news_stock_run", embedding_dim=10, filter_sizes=[3, 4, 5], folder_extra=folder_extra, reduce_num=0, test_part_start=test_part_array[0], test_part_end=test_part_array[-1]) data.append([accuracy, profit, origin_profit, predictions, others]) # lstm-model-1 5 so = LSTMStockOrigin() accuracy, profit, origin_profit, predictions, others = so.predict_rate(basic_path=basic_path, data_file=choose_stock_folder+"lstm/stock_origin_data.csv", model_folder=choose_stock_folder+"lstm/origin_model", folder_extra=folder_extra, reduce_num=10, test_part_start=test_part_array[0], test_part_end=test_part_array[-1]) data.append([accuracy, profit, origin_profit, predictions, others]) # lstm-model-2 6 accuracy, profit, origin_profit, predictions, others = so.predict_rate(basic_path=basic_path, data_file=choose_stock_folder+"lstm/stock_bindex_data.csv", model_folder=choose_stock_folder+"lstm/bindex_model", folder_extra=folder_extra, reduce_num=10, test_part_start=test_part_array[0], test_part_end=test_part_array[-1]) data.append([accuracy, profit, origin_profit, predictions, others]) # lstm-model-3 7 accuracy, profit, origin_profit, predictions, others = so.predict_rate(basic_path=basic_path, 
data_file=choose_stock_folder+"lstm/stock_news_data_%s.csv" % i, model_folder=choose_stock_folder+"lstm/news_model", folder_extra=folder_extra, reduce_num=10, test_part_start=test_part_array[0], test_part_end=test_part_array[-1]) data.append([accuracy, profit, origin_profit, predictions, others]) ensemble_data = {} for d in data: for di in range(len(d[3])): if d[4][di][1] not in ensemble_data: ensemble_data[d[4][di][1]] = [d[4][di][1], d[4][di][0], []] ensemble_data[d[4][di][1]][2].append(d[3][di]) data_len = len(data) data = {} for k in ensemble_data: d = ensemble_data[k] if len(d[2]) == data_len: data[k] = d e_data = sorted(data.items(), key=lambda x: time.mktime(time.strptime(x[0], '%Y-%m-%d'))) data = {"date": [], "rate": [], "predictions": []} for d in e_data: data["date"].append(d[1][0]) data["rate"].append(d[1][1]) data["predictions"].append(d[1][2]) pd.DataFrame(data).to_csv(data_file, index=False, columns=["date", "rate", "predictions"]) del cst1, cst2, e_data, data gc.collect()
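# The "predictions" column written above is a stringified Python list. A hedged sketch of
# reading the per-fold ensemble_data.csv back, using ast.literal_eval rather than eval:
import ast

import pandas as pd


def load_ensemble_data(path):
    df = pd.read_csv(path)
    df["predictions"] = df["predictions"].apply(ast.literal_eval)
    return df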
def makeOriginDataCsv(cls, cur=None, start_date=None, end_date=None, basic_path=None, output_file=None, stock_id=None): #初始化源文件路径和存储文件路径 if cur is None or start_date is None or end_date is None or output_file is None or stock_id is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) data = cur.execute( "select id, stock_id, date, opening, closing, difference, percentage_difference, lowest, highest, volume, amount from history where stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) data = cur.fetchall() if len(data) == 0: return None res = [] for d in data: res.append([ int(d[0]), int(d[1]), str(d[2]), float(d[3]), float(d[4]), float(d[5]), float(d[6]), float(d[7]), float(d[8]), float(d[9]), float(d[10]) ]) new_data = [] for d in zip(*res): new_data.append(d) origin_data = { 'id': new_data[0], 'stock_id': new_data[1], 'date': new_data[2], 'opening': new_data[3], 'closing': new_data[4], 'difference': new_data[5], 'percentage_difference': new_data[6], 'lowest': new_data[7], 'highest': new_data[8], 'volume': new_data[9], 'amount': new_data[10] } #读取原始数据,只保留需要使用的列 total_data = DataFrame(origin_data) total_data.sort_values(by=['stock_id', 'date'], inplace=True) #根据股票代码分组 g_stock_num = total_data.groupby(by=["stock_id"]) total_data["rate"] = 100 * (g_stock_num.shift(0)["closing"] / g_stock_num.shift(1)["closing"] - 1) for i in total_data.index: total_data.loc[i, 'rate'] = str( np.round(float(total_data['rate'][i]), 2)) #重新调整列的顺序,为接下来处理成输入、输出形式做准备 columns = [ "stock_id", "date", "opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount", "rate" ] total_data = total_data[columns] def func_train_data(data_one_stock_num): if cls.groupby_skip == False: cls.groupby_skip = True return None print("正在处理的股票代码:%06s" % data_one_stock_num.name) data = { "stock_id": [], "date": [], "opening": [], "closing": [], "difference": [], "percentage_difference": [], "lowest": [], "highest": [], "volume": [], "amount": [], "rate": [] } for i in range(len(data_one_stock_num.index) - 1): for k in data: data[k].append(data_one_stock_num.iloc[i][k]) pd.DataFrame(data).to_csv(output_path, index=False, columns=columns) total_data1 = total_data.dropna() total_data2 = total_data1.drop( total_data1[(total_data1.rate == 'nan')].index) g_stock_num = total_data2.groupby(by=["stock_id"]) #清空接收路径下的文件,初始化列名 cls.groupby_skip = False g_stock_num.apply(func_train_data)