def parser_for_hs300(start=settings.S_TRAIN_DATE, end=settings.Time_Slot):
    '''
    Load the HS300 index series from the monthly csv files and return the
    rolled, truncated time series.

    One file named ``MarketData_000300_SH_<month>.csv`` is expected under
    ``<settings.TXT_DATA_PATH>/hs300`` for every month that
    ``datetime2month(start, end)`` yields. Missing files are reported and
    skipped. In every row the first 15 characters of column 0 are parsed as
    a ``"%Y%m%d %H%M%S"`` timestamp and column 1 as a float index value.

    start, end -- forwarded unchanged to ``datetime2month``,
                  ``rolling.rolling_anal`` and ``truncate``.

    Returns the result of ``truncate(start, end, matrix)`` where ``matrix``
    is a list of ``(t, (value,))`` pairs built from the rolled data, or an
    empty list when no row could be read from any file.
    '''
    from pretreat_data.choose_folder import datetime2month

    data_folder = os.path.join(settings.TXT_DATA_PATH, 'hs300')
    names = sorted('MarketData_000300_SH_' + mon + '.csv'
                   for mon in datetime2month(start, end))

    rows = []
    for name in names:
        fn = os.path.join(data_folder, name)
        if not os.path.isfile(fn):
            print('Warning: %s does not existing, will skip it' % fn)
            continue
        with open(fn) as f:
            for line in f:
                fields = line.split(',')
                t = datetime.strptime(fields[0][:15], "%Y%m%d %H%M%S")
                rows.append((t, float(fields[1])))

    if not rows:
        # Without this guard np.asarray([]) is 1-D and the column slicing
        # below fails with an unhelpful IndexError.
        print('Warning: no hs300 data found in %s' % data_folder)
        return []

    raw = np.asarray(rows)
    rolled = rolling.rolling_anal(start, end, settings.WINDOW_SIZE,
                                  raw[:, 0], raw[:, 1])
    # wrap every value in a 1-tuple so rows look like (t, (value,))
    hs300_matrix = [(t, (value,)) for t, value in rolled]
    return truncate(start, end, hs300_matrix)
def parser_for_stock(data_folders, start=settings.S_TRAIN_DATE,
                     end=settings.Time_Slot, save_to_dict=False):
    '''
    From the data folders, build the truncated time-series matrix for the
    symbols chosen by ``naive_ranking`` from their average spread/volume.

    For every symbol in ``settings.SYMBOLS`` the first file in each folder
    whose name contains the symbol is read. In every row the first 15
    characters of column 0 are parsed as a ``"%Y%m%d %H%M%S"`` timestamp;
    columns 2 and 12 (``b1``/``s1``, presumably the best bid and ask --
    confirm against the data spec) give the mid price ``(b1 + s1) / 2`` and
    the spread ``(b1 - s1) / 2``; the last column is recorded as volume.

    data_folders -- iterable of directories to scan, in order.
    start, end   -- forwarded to ``rolling.rolling_anal`` and ``truncate``.
    save_to_dict -- when true the rolled array of each symbol is stored in a
                    local dict. NOTE(review): that dict is never returned,
                    so the flag currently has no visible effect.

    Returns ``(stock_matrix, matrix_title)``: ``stock_matrix`` is a list of
    ``(t, values)`` pairs with ``values`` restricted to the selected
    columns, and ``matrix_title`` lists the matching symbols.
    '''
    time_format = "%Y%m%d %H%M%S"
    ts_data_dict, score_dict = {}, {}
    stock_matrix = []

    for c, symbol in enumerate(settings.SYMBOLS):
        print('%s %s' % (c, symbol))
        tmp_timeline, tmp_ave, tmp_spread, tmp_vol = [], [], [], []

        for folder in data_folders:
            matches = [name for name in os.listdir(folder) if symbol in name]
            if not matches:
                print("Warning: empty file of %s in the data folder %s"
                      % (symbol, folder))
                continue
            filename = os.path.join(folder, matches[0])

            # test_t starts at the timestamp of the first row of the file;
            # it is seeded inside the loop so the file is opened only once
            # and always closed.
            test_t, cache_v = None, None
            with open(filename) as f:
                for line in f:
                    fields = line.split(',')
                    t = datetime.strptime(fields[0][:15], time_format)
                    b1 = float(fields[2])
                    s1 = float(fields[12])
                    v = float(fields[-1])

                    tmp_timeline.append(t)
                    tmp_ave.append((b1 + s1) / 2.)
                    tmp_spread.append((b1 - s1) / 2.)

                    # While the day-of-month has not moved past test_t the
                    # cached volume follows the current row; afterwards the
                    # last cached value is repeated.
                    # NOTE(review): assumes one volume entry per row --
                    # confirm against the intended behaviour.
                    if test_t is None or t.day <= test_t.day:
                        test_t = t
                        cache_v = v
                    tmp_vol.append(cache_v)

        # truncate the raw data to the time series data
        tmp_data = np.vstack((tmp_timeline, tmp_ave)).T
        # rolling the data
        # NOTE: the start and end used here is meaningless, BUG, Lance, 2013/10/20
        tmp_data = rolling.rolling_anal(start, end, settings.WINDOW_SIZE,
                                        tmp_data[:, 0], tmp_data[:, 1])
        # save the numpy array to the dict if requested
        if save_to_dict:
            ts_data_dict[symbol] = tmp_data
        # convert to tuples, which makes the binding faster
        tmp_data = [(a[0], tuple(a[1:])) for a in tmp_data]
        has_data = len(tmp_data) != 0

        if c == 0:
            stock_matrix = tmp_data
        elif has_data:
            stock_matrix = logic.bind(stock_matrix, tmp_data)
        else:
            # keep the column count aligned when a symbol has no data
            stock_matrix = [(i, j + ('NA',)) for i, j in stock_matrix]

        if has_data:
            # the spread and volume averages are the ranking criteria,
            # 2013/10/19
            score_dict[symbol] = (np.average(tmp_spread),
                                  np.average(tmp_vol))
        else:
            score_dict[symbol] = (None, None)

    # select the symbols and prepare the matrix
    selected_index = naive_ranking.naive_ranking(score_dict)
    raw_stock_matrix = truncate(start, end, stock_matrix)
    stock_matrix = [(t, tuple([s[i] for i in selected_index]))
                    for t, s in raw_stock_matrix]
    matrix_title = [settings.SYMBOLS[i] for i in selected_index]
    return stock_matrix, matrix_title
def parser_for_prediction(data_folders, selected_symbols,
                          start=settings.S_TRAIN_DATE,
                          end=settings.Time_Slot, save_to_dict=False):
    '''
    From the data folders, build the truncated time-series matrix of mid
    prices for the given symbols.

    For every symbol in ``selected_symbols`` the first file in each folder
    whose name contains the symbol is read. In every row the first 15
    characters of column 0 are parsed as a ``"%Y%m%d %H%M%S"`` timestamp and
    columns 2 and 12 (``b1``/``s1``, presumably the best bid and ask --
    confirm against the data spec) are averaged into the mid price.

    data_folders     -- iterable of directories to scan, in order.
    selected_symbols -- symbols to load; the first one seeds the matrix and
                        every later one is combined with ``logic.merge``.
    start, end       -- forwarded to ``rolling.rolling_anal`` and
                        ``truncate``.
    save_to_dict     -- when true the rolled array of each symbol is stored
                        in a local dict. NOTE(review): that dict is never
                        returned, so the flag currently has no visible
                        effect.

    Returns the result of ``truncate(start, end, stock_matrix)``.
    '''
    time_format = "%Y%m%d %H%M%S"
    ts_data_dict = {}
    stock_matrix = []

    for c, symbol in enumerate(selected_symbols):
        print('%s %s' % (c, symbol))
        tmp_timeline, tmp_ave = [], []

        for folder in data_folders:
            matches = [name for name in os.listdir(folder) if symbol in name]
            if not matches:
                print("Warning: empty file of %s in the data folder %s"
                      % (symbol, folder))
                continue
            filename = os.path.join(folder, matches[0])

            with open(filename) as f:
                for line in f:
                    fields = line.split(',')
                    t = datetime.strptime(fields[0][:15], time_format)
                    b1, s1 = float(fields[2]), float(fields[12])
                    tmp_timeline.append(t)
                    tmp_ave.append((b1 + s1) / 2.)

        # truncate the raw data to the time series data
        tmp_data = np.vstack((tmp_timeline, tmp_ave)).T
        # rolling the data
        # NOTE: the start and end used here is meaningless, BUG, Lance, 2013/10/20
        tmp_data = rolling.rolling_anal(start, end, settings.WINDOW_SIZE,
                                        tmp_data[:, 0], tmp_data[:, 1])
        # save the numpy array to the dict if requested
        if save_to_dict:
            ts_data_dict[symbol] = tmp_data
        # convert to tuples, which makes the merging faster
        tmp_data = [(a[0], tuple(a[1:])) for a in tmp_data]

        if c == 0:
            stock_matrix = tmp_data
        else:
            stock_matrix = logic.merge(stock_matrix, tmp_data)

    return truncate(start, end, stock_matrix)