def GetTrainData(max_samples=2000000):
    """Return a shuffled sample of the cached training features and labels.

    Calls GetTrainDataTempSave() to (re)write the temporary feature/label
    files, loads both, draws one random row order and applies it to both
    arrays, keeping at most ``max_samples`` rows.

    The feature and label columns are taken directly from the two loaded
    arrays. They are not re-sliced out of a concatenated array, so the result
    does not depend on where the label column sat in the original dataset.

    Args:
        max_samples: Upper bound on the number of rows returned. Defaults to
            2000000 (the previously hard-coded limit). ``None`` keeps all rows.

    Returns:
        Tuple ``(train_features, train_labels)``: two newly allocated 2-D
        arrays with the same number of rows and the same row order.

    Raises:
        ValueError: If the two files do not contain the same number of rows.
    """
    GetTrainDataTempSave()
    features = np.load('./data/dataset/fix_dataset_temp_features.npy')
    labels = np.load('./data/dataset/fix_dataset_temp_labels.npy')
    if len(features) != len(labels):
        raise ValueError(
            "feature/label row count mismatch: {} vs {}".format(
                len(features), len(labels)))
    # Report the shape the combined (features + labels) table would have.
    combined_shape = (features.shape[0], features.shape[1] + labels.shape[1])
    print("train_data: {}".format(combined_shape))

    print("reorder...")
    # Same random-order construction as before, so a seeded run selects the
    # same rows. Truncating the index first avoids shuffling rows that would
    # be thrown away immediately afterwards.
    order = np.argsort(np.random.random(len(features)))
    order = order[:max_samples]

    print("get feature ...")
    # Integer-array indexing already yields a new array, no .copy() needed.
    train_features = features[order]

    print("get label...")
    train_labels = labels[order]

    print("train_features: {}".format(train_features.shape))
    print("train_labels: {}".format(train_labels.shape))
    return train_features, train_labels
def GetTrainDataTempSave():
    """Write the training rows' features and labels to temporary .npy files.

    Loads the array stored at FileNameFixDataSet(), keeps the rows whose
    trade-date column (index ``feature.COL_TRADE_DATE(0)``) is strictly less
    than ``dataset_train_test_split_date``, and saves two column slices of
    those rows:

    * columns ``[0, feature.FEATURE_SIZE())`` to
      ``./data/dataset/fix_dataset_temp_features.npy``
    * the single column ``feature.COL_ACTIVE_LABEL()`` (kept 2-D) to
      ``./data/dataset/fix_dataset_temp_labels.npy``

    Returns:
        None.
    """
    full_set = np.load(FileNameFixDataSet())
    print("dataset: {}".format(full_set.shape))

    trade_dates = full_set[:, feature.COL_TRADE_DATE(0)]
    train_rows = full_set[trade_dates < dataset_train_test_split_date]
    print("train_data: {}".format(train_rows.shape))

    feature_block = train_rows[:, :feature.FEATURE_SIZE()]
    label_col = feature.COL_ACTIVE_LABEL()
    # A one-column slice (not a plain index) keeps the labels two-dimensional.
    label_block = train_rows[:, label_col:label_col + 1]

    np.save('./data/dataset/fix_dataset_temp_features.npy', feature_block)
    np.save('./data/dataset/fix_dataset_temp_labels.npy', label_block)
def GetTrainTestDataSplitByDate():
    """Split the dataset into train/test parts around the split date.

    Rows whose trade-date column (index ``feature.COL_TRADE_DATE(0)``) is
    strictly less than ``dataset_train_test_split_date`` form the training
    part; all remaining rows form the test part.

    Returns:
        Tuple ``(train_features, train_labels, test_features, test_labels,
        test_data)``. Features are columns ``[0, feature.FEATURE_SIZE())``,
        labels are the single column ``feature.COL_ACTIVE_LABEL()`` kept as a
        2-D slice, and ``test_data`` is the full set of test rows.
    """
    full_set = np.load(FileNameFixDataSet())
    print("dataset: {}".format(full_set.shape))

    is_train = full_set[:, feature.COL_TRADE_DATE(0)] < dataset_train_test_split_date
    train_rows = full_set[is_train]
    test_rows = full_set[~is_train]
    print("train: {}".format(train_rows.shape))
    print("test: {}".format(test_rows.shape))

    feature_cols = slice(0, feature.FEATURE_SIZE())
    label_col = feature.COL_ACTIVE_LABEL()
    # One-column slice so the labels stay two-dimensional.
    label_cols = slice(label_col, label_col + 1)

    return (
        train_rows[:, feature_cols],
        train_rows[:, label_cols],
        test_rows[:, feature_cols],
        test_rows[:, label_cols],
        test_rows,
    )
def GetTrainTestDataSampleByDate(test_ratio):
    """Split the dataset into train/test parts based on the trade-date value.

    ``sample_num`` is ``int(1.0 / test_ratio + 0.0001)``. A row goes to the
    test part when the last two decimal digits of its integer-cast trade date
    are divisible by ``sample_num``; every other row goes to the training part.

    Args:
        test_ratio: Approximate fraction of data to place in the test part;
            only its reciprocal (truncated to an int) is used.

    Returns:
        Tuple ``(train_features, train_labels, test_features, test_labels,
        test_data)``. Features are columns ``[0, feature.FEATURE_SIZE())``,
        labels are the single column ``feature.COL_ACTIVE_LABEL()`` kept as a
        2-D slice, and ``test_data`` is the full set of test rows.
    """
    sample_num = int(1.0 / test_ratio + 0.0001)
    full_set = np.load(FileNameFixDataSet())
    print("dataset: {}".format(full_set.shape))

    # NOTE(review): presumably dates are YYYYMMDD so "% 100" is the day of the
    # month -- confirm against the dataset builder.
    date_tail = full_set[:, feature.COL_TRADE_DATE(0)].astype(int) % 100
    is_test = (date_tail % sample_num) == 0
    test_rows = full_set[is_test]
    train_rows = full_set[~is_test]
    print("train: {}".format(train_rows.shape))
    print("test: {}".format(test_rows.shape))

    feature_cols = slice(0, feature.FEATURE_SIZE())
    label_col = feature.COL_ACTIVE_LABEL()
    # One-column slice so the labels stay two-dimensional.
    label_cols = slice(label_col, label_col + 1)

    return (
        train_rows[:, feature_cols],
        train_rows[:, label_cols],
        test_rows[:, feature_cols],
        test_rows[:, label_cols],
        test_rows,
    )
def GetTrainTestDataRandom(test_ratio):
    """Randomly split the dataset rows into train/test parts.

    ``sample_num`` is ``int(1.0 / test_ratio + 0.0001)``. Each row draws a
    random integer in ``[0, sample_num)`` from NumPy's global random state;
    rows that draw 0 go to the test part, all others to the training part.

    Args:
        test_ratio: Approximate fraction of rows to place in the test part;
            only its reciprocal (truncated to an int) is used.

    Returns:
        Tuple ``(train_features, train_labels, test_features, test_labels,
        test_data)``. Features are columns ``[0, feature.FEATURE_SIZE())``,
        labels are the single column ``feature.COL_ACTIVE_LABEL()`` kept as a
        2-D slice, and ``test_data`` is the full set of test rows.
    """
    sample_num = int(1.0 / test_ratio + 0.0001)
    full_set = np.load(FileNameFixDataSet())
    print("dataset: {}".format(full_set.shape))
    print('sample_num:%u' % sample_num)

    # Draw one integer in 0..(sample_num - 1) per row; the rows that drew 0
    # become the test set.
    draws = np.random.randint(0, sample_num, size=len(full_set))
    is_test = draws == 0
    test_rows = full_set[is_test]
    train_rows = full_set[~is_test]
    print("train: {}".format(train_rows.shape))
    print("test: {}".format(test_rows.shape))

    feature_cols = slice(0, feature.FEATURE_SIZE())
    label_col = feature.COL_ACTIVE_LABEL()
    # One-column slice so the labels stay two-dimensional.
    label_cols = slice(label_col, label_col + 1)

    return (
        train_rows[:, feature_cols],
        train_rows[:, label_cols],
        test_rows[:, feature_cols],
        test_rows[:, label_cols],
        test_rows,
    )