Exemplo n.º 1
0
class DataLoader:
    def __init__(self, future_ws=50, win=1):
        self.future_ws = future_ws
        self.win = win
        self.r = Reader()
        self.dr = Dater()

    def Load(self, file_name):
        return self.Labelling(self.r.LoadOneDF(file_name))
        #return self.r.LoadOneDF(file_name)

    def Labelling(self, df, cut_tail=False):
        if len(df) == 0:
            print('labelling passed empty df in!')
            return df
        df = df[df['asks[0]'] - df['bids[0]'] > 0]
        min_move = (df['asks[0]'] - df['bids[0]']).min()
        self.label_map = {'Flat': 0, 'Buy': 1, 'Sell': 2}
        if 'label' in df.columns:
            print('df has labelled!')
            return df
        ask_min = list(
            map(
                lambda x: -x,
                GetSlipWindowMax((df['asks[0]'] * -1).tolist(),
                                 self.future_ws)))
        bid_max = GetSlipWindowMax(df['bids[0]'].tolist(), self.future_ws)
        df = df.drop([i for i in range(len(df) - self.future_ws, len(df))])
        ask_min = ask_min[self.future_ws:]
        bid_max = bid_max[self.future_ws:]
        #print('len(df)=%d, len(ask_min)=%d' % (len(df), len(bid_max)))
        df.insert(len(df.columns), 'label', 0)
        df.loc[df['bids[0]'] - ask_min >= self.win * min_move,
               'label'] = self.label_map['Sell']
        df.loc[bid_max - df['asks[0]'] >= self.win * min_move,
               'label'] = self.label_map['Buy']
        if cut_tail:
            df.loc[[i for i in range(500)], 'label'] = self.label_map['Flat']
            df.loc[[i for i in range(len(df) - 500, len(df))],
                   'label'] = self.label_map['Flat']
        #print('after labelling: %.1f%% are buy, %.1f%% are sell, %.1f%% are flat' %((df['label']==self.label_map['Buy']).mean()*100, (df['label']==self.label_map['Sell']).mean()*100, (df['label']==self.label_map['Flat']).mean()*100))
        return df

    def GetXY(self, file_name, cut_tail=True):
        start_time = time.time()
        df = self.Load(file_name)
        if len(df) == 0:
            print('GetXY loading empty df!')
            return np.array([[]]), np.array([])
        y = df['label'].tolist()
        del df['label']
        del df['time_sec']
        del df['ticker']
        start_time = time.time()
        x = rolled(df, self.future_ws, cut_tail)
        if cut_tail:
            y = y[self.future_ws:]
        print('GetXY cost %.2fs' % (time.time() - start_time))
        return x, y

    def GetListXY(self, file_list):
        if len(file_list) == 0:
            print('meet empty filelist')
            return [[]], []
        print('GetListXY started: %s-%s' % (file_list[0], file_list[-1]))
        start_time = time.time()
        for i, fl in enumerate(file_list):
            x, y = self.GetXY(fl)
            if len(y) > 0:
                file_list = file_list[i + 1:]
                break
        x, y = np.array(x), np.array(y)
        #print('init x,y shape is %s and %s' %(np.shape(x), np.shape(y)))
        for fl in file_list:
            #print('Getting XY from %s' % (fl))
            temp_x, temp_y = self.GetXY(fl)
            if len(temp_x) > 0 and len(temp_y) > 0:
                #x = np.append(x, temp_x, axis=0)
                x = np.vstack((x, temp_x))
                y = np.hstack((y, temp_y))
                #y = np.append(y, temp_y, axis=0)
        print('GetListXY finished: %.1f%% buy, %.1f%% sell, cost %.1fs' %
              ((y == 1).mean() * 100,
               (y == 2).mean() * 100, time.time() - start_time))
        return x, y

    def GetTrainTest(self,
                     con,
                     train_start,
                     train_end,
                     test_start,
                     test_end,
                     add_post=True):
        if train_end < train_start or test_end < test_start:
            print('GetTrainTest Failed! %s %s %s %s not accending' %
                  (train_start, train_end, test_start, test_end))
            sys.exit(1)
        train_date = self.dr.GetDateList(train_start, train_end)
        test_date = self.dr.GetDateList(test_start, test_end)
        train_list = []
        test_list = []
        for td in train_date:
            file_path = '/running/' + td + '/' + con + ("8888" if add_post else
                                                        '') + '.csv'
            print(file_path)
            if os.path.exists(file_path):
                train_list.append(file_path)
        for td in test_date:
            file_path = '/running/' + td + '/' + con + ('8888' if add_post else
                                                        '') + '.csv'
            if os.path.exists(file_path):
                test_list.append(file_path)
        print('find %d trainfile %d testfile' %
              (len(train_list), len(test_list)))
        return train_list, test_list

    def GetBatch(self, x, y, batch_size=128):
        start_index = np.random.randint(len(x) - self.future_ws,
                                        size=batch_size)
        rx, ry = [], []
        for si in start_index:
            #rx.append(x[si:si+self.future_ws])
            #ry.append(self.OneHot(y[si+self.future_ws-1]).tolist()[0])
            rx.append(x[si])
            #ry.append(self.OneHot(y[si]).tolist()[0])
            ry.append(self.LabelToClass(y[si]))
        return rx, ry

    def LabelToClass(self, y):
        r = []
        if isinstance(y, list):
            for l in y:
                if l == self.label_map['Flat']:
                    r.append([1, 0, 0])
                elif l == self.label_map['Buy']:
                    r.append([0.1, 0.9, 0])
                else:
                    r.append([0.1, 0, 0.9])
            return r
        if y == self.label_map['Flat']:
            return [1, 0, 0]
        elif y == self.label_map['Buy']:
            return [0.1, 0.9, 0]
        else:
            return [0.1, 0, 0.9]

    def OneHot(self, y, C=3):
        return np.eye(C)[np.array(y).reshape(-1)]
Exemplo n.º 2
0
import numpy as np
import pandas as pd
from Reader import *

r=Reader()
a = r.LoadOneDF('/running/2019-02-18/ni8888.csv')
r.Labelling(a)