Example No. 1
def get_feature_value():
    feature_vector_list = list()
    feature_name_list = list()
    for feature_name in FEATURE_NAME_LIST:
        feature_name_list.append(feature_name)
        feature_list = CommonUtil.read_csv(MarketDataCrawler.MARKET_DATA_PATH +
                                           '/' + feature_name + '.csv')
        feature_dict = dict()
        for feature_i in range(1, len(feature_list)):
            date = CommonUtil.get_datetime_from_string_(
                feature_list[feature_i][0]).date()
        # opening price
            feature_value = float(feature_list[feature_i][1])
            feature_dict[date] = feature_value
        for value_key in valueDict.keys():
            feature_value = feature_dict.get(value_key, 'N/A')
            feature_items = featureValue.get(value_key, list())
            feature_items.append(feature_value)
            featureValue[value_key] = feature_items
    feature_name_list.append(VALUE_NAME)
    feature_name_list.insert(0, 'DATE')
    feature_vector_list.append(feature_name_list)
    for key in featureValue.keys():
        feature_items = featureValue[key]
        feature_items.append(valueDict[key])
        feature_items.insert(0, key)
        feature_vector_list.append(feature_items)
    CommonUtil.write_csv('../files/marketdata/FEATURE_VECTOR.csv',
                         feature_vector_list)
Example No. 2
def get_value_list():
    value_list = CommonUtil.read_csv(MarketDataCrawler.MARKET_DATA_PATH + '/' +
                                     VALUE_NAME + '.csv')
    for value_i in range(1, len(value_list)):
        date = CommonUtil.get_datetime_from_string_(
            value_list[value_i][0]).date()
        # closing price
        value = float(value_list[value_i][2])
        valueDict[date] = value
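
# A hypothetical driver, assuming valueDict and featureValue are module-level
# dicts (both are used in the functions above but never declared in these snippets):
valueDict = dict()
featureValue = dict()
get_value_list()      # fill valueDict with closing prices keyed by date
get_feature_value()   # join every feature onto those dates and write FEATURE_VECTOR.csv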
Example No. 3
def adjust_feature_vector():
    feature_vector_list = CommonUtil.read_csv(
        '../files/marketdata/FEATURE_VECTOR.csv')
    pre_item = feature_vector_list[0]
    current_item = pre_item
    for vector_i in range(1, len(feature_vector_list)):
        current_item = feature_vector_list[vector_i]
        for i in range(1, len(current_item)):
            if current_item[i] == 'N/A':
                current_item[i] = pre_item[i]
        feature_vector_list[vector_i] = current_item
        pre_item = current_item
    CommonUtil.write_csv('../files/marketdata/ADJUSTED_FEATURE_VECTOR.csv',
                         feature_vector_list)
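
# A minimal self-contained illustration of the same forward fill (rows are hypothetical):
rows = [['DATE', 'OPEN', 'CLOSE'],
        ['2017-01-02', '1.05', '1.06'],
        ['2017-01-03', 'N/A', '1.07']]
for row_i in range(1, len(rows)):
    for col_i in range(1, len(rows[row_i])):
        if rows[row_i][col_i] == 'N/A':
            rows[row_i][col_i] = rows[row_i - 1][col_i]  # inherit the previous value
print(rows[2])  # ['2017-01-03', '1.05', '1.07']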
Example No. 4
def reduce_feature_vector():
    logger.info("In Reduce Feature Vector...")
    prepare_feature()
    origin_feature_num = len(featureDict.keys())
    global featureVectorList
    reduced_feature_vector_list = list()
    feature_list = list()
    feature_count_threshold = 2
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    featureVectorList = CommonUtil.read_csv(file_path)
    feature_count_dict = dict()
    feature_count_list = [0] * origin_feature_num
    is_title = True
    for feature_vector in featureVectorList:
        if is_title:
            is_title = False
        else:
            for feature_value_index in range(0, origin_feature_num):
                if feature_vector[feature_value_index] != '0.0':
                    feature_count_list[feature_value_index] += 1
    feature_index = 0
    for key in featureDict.keys():
        feature_count = feature_count_list[feature_index]
        feature_count_dict[key] = feature_count
        if feature_count >= feature_count_threshold:
            feature_list.append(feature_index)
        feature_index += 1
    logger.info('Reduce Feature Vector to: ' + str(len(feature_list)))
    feature_list.append(origin_feature_num)
    # assemble the feature vectors whose counts exceed the threshold
    for feature_vector in featureVectorList:
        reduced_feature_vector = list()
        for feature_value_index in range(0, origin_feature_num + 1):
            if feature_value_index in feature_list:
                try:
                    reduced_feature_vector.append(
                        feature_vector[feature_value_index])
                except IndexError:
                    logger.error(feature_vector)
                    logger.error(feature_value_index)
        reduced_feature_vector_list.append(reduced_feature_vector)
    file_path = REDUCED_FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, reduced_feature_vector_list)
    logger.info("Reduce Feature Vector Done!")
Example No. 5
def feature_col_count():
    logger.info("In Count Feature Appear...")
    prepare_feature()
    feature_count_dict = dict()
    global newsFeatureList
    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    feature_count_list = [0] * len(featureDict.keys())
    for feature_vector in newsFeatureList:
        feature_index = 0
        for feature_value_index in range(1, len(feature_vector)):
            if feature_vector[feature_value_index] != '0':
                feature_count_list[feature_index] += 1
            feature_index += 1
    feature_index = 0
    for key in featureDict.keys():
        feature_count = feature_count_list[feature_index]
        feature_index += 1
        feature_count_dict[key] = feature_count
        row_item = key + "," + str(feature_count)  # str() avoids a TypeError: feature_count is an int
        logger.info(row_item)
    logger.info("Count Feature Appear Done!")
Example No. 6
    print(dataNum)
    for i in range(dataNum - 1):
        if y[i] == 0:
            y[i] = 0
        elif y[i] < 0:
            y[i] = -1
        else:
            y[i] = 1
    return y


if __name__ == "__main__":
    # load the sample dataset
    n = 120
    csv_file = CommonUtil.read_csv('../files/files_train/files_' + str(n) +
                                   'min/REDUCED_FEATURE_VECTOR_' + str(n) +
                                   '.csv')
    dataNum = len(csv_file)
    featureNum = len(csv_file[0]) - 1
    print("Dimension of feature", featureNum)
    dataMat = np.array(csv_file)
    X = dataMat[1:, 0:featureNum].astype(float)
    y = dataMat[1:, featureNum].astype(float)
    y = convert2class(y, dataNum)  # convert to class labels

    # Neural networks are sensitive to feature scale, so it is best to
    # standardize, normalize, or rescale to [-1, 1] before training.
    scaler = StandardScaler()  # standardization transform
    scaler.fit(X)  # fit the scaler
    X = scaler.transform(X)  # transform the dataset
    # solver='lbfgs': the MLP solver. L-BFGS performs well on small datasets,
    # Adam is fairly robust, and SGD (stochastic gradient descent) gives the
    # best results when its parameters and iteration count are well tuned.
    # alpha: the L2 regularization strength. MLP supports regularization
    # (L2 by default); the value needs tuning.
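    # The snippet ends here; a minimal continuation sketch assuming an
    # MLPClassifier follows the comments above (the hyperparameters below are
    # illustrative assumptions, not the original author's values):
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                          hidden_layer_sizes=(64, 32), random_state=1)
    model.fit(X, y)
    print("train accuracy:", model.score(X, y))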
Example No. 7
    print(dataNum)
    for i in range(dataNum - 1):
        if y[i] == 0:
            y[i] = 0
        elif y[i] < 0:
            y[i] = -1
        else:
            y[i] = 1
    return y


if __name__ == "__main__":
    # load the sample dataset
    n = 120
    csv_file = CommonUtil.read_csv('../files/files_train/files_' + str(n) +
                                   'min/REDUCED_FEATURE_VECTOR_' + str(n) +
                                   '.csv')
    dataNum = len(csv_file)
    featureNum = len(csv_file[0]) - 1
    print("Dimension of feature", featureNum)
    dataMat = np.array(csv_file)
    X = dataMat[1:, 0:featureNum].astype(float)
    y = dataMat[1:, featureNum].astype(float)
    y = convert2class(y, dataNum)  # convert to class labels

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)  # split the dataset

    # train the model
    model = xgb.XGBClassifier(max_depth=3,
                              learning_rate=0.1,
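    # The call above is cut off mid-argument-list; a minimal completion sketch
    # (the remaining hyperparameters are illustrative assumptions, not the
    # original values):
    model = xgb.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100)
    # Recent XGBoost releases require labels in [0, n_classes), so shift {-1, 0, 1}:
    model.fit(X_train, y_train + 1)
    print("test accuracy:", model.score(X_test, y_test + 1))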
Example No. 8
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from util import CommonUtil
from sklearn.model_selection import train_test_split


# #############################################################################
# Import and load the dataset

file_dir = 'C:/Users/yuzhe/Desktop/OptionAnalysis/files/'
csv_file = CommonUtil.read_csv(file_dir + 'TestUSDIndex.csv')
dataNum = len(csv_file)
featureNum = len(csv_file[0]) - 2
print("Dimension of feature", featureNum)
dataMat = np.array(csv_file)
X = dataMat[1:, 1: featureNum].astype(float)
y = dataMat[1:, featureNum].astype(float)

'''
# Convert the growth rate in y into three labels: up, down, unchanged
for i in range(dataNum - 1):
    if y[i] == 0: y[i] = 0
    elif y[i] < 0: y[i] = -1
    else: y[i] = 1
'''
# Split the dataset
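# The snippet ends here; a minimal continuation sketch assuming the imported
# ensemble module is used for gradient-boosted regression (the hyperparameters
# are illustrative assumptions):
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
model = ensemble.GradientBoostingRegressor(n_estimators=200, max_depth=3,
                                           learning_rate=0.1, random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print("MSE:", mean_squared_error(y_test, pred))
print("R2:", r2_score(y_test, pred))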
Example No. 9
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in scikit-learn 0.20
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from util import CommonUtil
# Load the sample dataset
if __name__ == '__main__':
    n = 180
    csv_file = CommonUtil.read_csv(
        'C:/Users/yuzhe/Desktop/OptionAnalysis/files/files_' + str(n) +
        'min/REDUCED_FEATURE_VECTOR_' + str(n) + '.csv')
    dataNum = len(csv_file)
    featureNum = len(csv_file[0]) - 1
    print("Dimension of feature", featureNum)
    dataMat = np.array(csv_file)
    X = dataMat[1:, 0:featureNum].astype(float)
    y = dataMat[1:, featureNum].astype(float)
    print(dataNum)
    for i in range(dataNum - 1):
        if y[i] == 0:
            y[i] = 0
        elif y[i] < 0:
            y[i] = -1
        else:
            y[i] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)  # split the dataset
    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])

    param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
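    # The snippet ends here; a minimal completion sketch assuming a grid search
    # over C and gamma for the SVC (the exact grid layout is an assumption):
    param_grid = [{'clf__C': param_range, 'clf__kernel': ['linear']},
                  {'clf__C': param_range, 'clf__gamma': param_range,
                   'clf__kernel': ['rbf']}]
    gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                      scoring='accuracy', cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    print(gs.best_score_, gs.best_params_)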
Example No. 10
def generate_feature_vector():
    logger.info("In Generate Feature Vector...")
    prepare_feature()
    # set the header row
    title_list = list(featureDict.keys())
    title_list.append('TARGET')
    featureVectorList.append(title_list)
    feature_size = len(featureDict.keys())
    global newsFeatureList
    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    global processedPriceList
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    processedPriceList = CommonUtil.read_csv(file_path)
    # News runs from 20160630 through 20171229; prices run from 20160701 through 20171229
    last_news_begin = 0
    news_feature_begin_index = last_news_begin
    pre_price_item = list()
    pre_price_item.append(PRICE_START_TIME)
    pre_price_item.append(0)
    price_start_time = CommonUtil.get_datetime_from_string(PRICE_START_TIME)
    price_end_time = CommonUtil.get_datetime_from_string(PRICE_END_TIME)
    # Treat news published while the market is closed as occurring NEWS_INFLUENCE_MOST minutes before the market opens
    for news_index in range(0, len(newsFeatureList)):
        news_feature = newsFeatureList[news_index]
        news_time = news_feature[0]
        # reset the news timestamp
        news_feature[0] = CommonUtil.reset_news_time(
            news_time, NEWS_INFLUENCE_MOST, MARKET_OPEN_TIME, MARKET_CLOSE_TIME)
        newsFeatureList[news_index] = news_feature
    for current_price_item in processedPriceList:
        current_price_time = CommonUtil.get_datetime_from_string(
            current_price_item[0])
        if price_start_time <= current_price_time < price_end_time:
            # compute the price change
            price_delta = round(
                (float(current_price_item[1]) - float(pre_price_item[1])) *
                FEATURE_VECTOR_SCALE, CURRENCY_PAIR_PRECISION)
            pre_price_time = CommonUtil.get_datetime_from_string(
                pre_price_item[0])
            logger.debug(current_price_time)
            # Sum the influence of the news between pre_price_time and
            # current_price_time; last_interval_minutes >= 1
            last_interval_minutes = int(
                CommonUtil.get_interval_seconds(current_price_time,
                                                pre_price_time) / 60)
            influence_feature_vector = [0.0] * feature_size
            # compute the news influence at each sample point between the two prices
            is_influenced_price = False
            for minute_i in range(0, last_interval_minutes):
                # the time point to evaluate: after pre_price_time, up to and including current_price_time
                time_i = CommonUtil.get_minute_changed(pre_price_time,
                                                       minute_i + 1)
                # find the news that influences this time point
                for news_feature_begin_index in range(last_news_begin,
                                                      len(newsFeatureList)):
                    interval_seconds = CommonUtil.get_interval_seconds(
                        time_i,
                        CommonUtil.get_datetime_from_string(
                            newsFeatureList[news_feature_begin_index][0]))
                    # if any news falls within the influence window
                    if 0 <= interval_seconds <= NEWS_INFLUENCE_DACAY_THRESHOLD * 60:
                        for news_feature_end_index in range(
                                news_feature_begin_index,
                                len(newsFeatureList)):
                            if CommonUtil.get_datetime_from_string(newsFeatureList[news_feature_end_index][0]) \
                                    > time_i:
                                break
                        str_begin_end = str(minute_i + 1) + ': news->' + str(
                            news_feature_begin_index) + ' : ' + str(
                                news_feature_end_index - 1)
                        logger.debug(str_begin_end)
                        for news_feature_index in range(
                                news_feature_begin_index,
                                news_feature_end_index):
                            current_news_feature = newsFeatureList[
                                news_feature_index]
                            influence_score = decay_influence(
                                CommonUtil.get_datetime_from_string(
                                    current_news_feature[0]), time_i)
                            for value_i in range(0, feature_size):
                                influence_feature_vector[value_i] += float(current_news_feature[value_i + 1]) \
                                                                     * influence_score
                        is_influenced_price = True
                        break
                    elif interval_seconds < 0:
                        break
                last_news_begin = news_feature_begin_index
            if is_influenced_price:
                influence_feature_vector.append(price_delta)
                featureVectorList.append(influence_feature_vector)
        pre_price_item = current_price_item
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, featureVectorList)
    logger.info("Generate Feature Vector Done!")
Example No. 11
def process_original_price():
    logger.info("In Process Original Price...")
    global originalPriceList
    originalPriceList = CommonUtil.read_csv(ORIGINAL_PRICE_PATH)
    sample_datetime = None
    sample_price_list = list()
    # for each original price record
    for original_price in originalPriceList:
        logger.debug('price time: ' + original_price[0])
        price_datetime = CommonUtil.get_datetime_from_string(original_price[0])
        price_value = float(original_price[1])
        if sample_datetime is None:
            sample_datetime = CommonUtil.get_datetime_from_string(
                PRICE_START_TIME)
        time_interval = CommonUtil.get_interval_seconds(
            price_datetime, sample_datetime)
        # The price time is outside the sampling window (well before the sample point); take the next price
        if time_interval < -PRICE_SAMPLE_MINUTE * 60 / 2:
            continue
        # If the current time is past the sampling window, first compute the average price for the previous sample time, then look for the next sample point
        while time_interval >= PRICE_SAMPLE_MINUTE * 60 / 2:
            # if the current sample point has prices
            if len(sample_price_list) > 0:
                price_sum = 0
                for price_item in sample_price_list:
                    price_sum += price_item
                average_price = round(price_sum / len(sample_price_list),
                                      CURRENCY_PAIR_PRECISION + 2)
                sample_datetime_str = CommonUtil.get_string_from_datetime(
                    sample_datetime)
                average_price_item = [sample_datetime_str, average_price]
                # append the sample time and its computed price to the list
                processedPriceList.append(average_price_item)
                # reset the per-sample price list
                sample_price_list = list()
            # compute the next sample point
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, PRICE_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(
                price_datetime, sample_datetime)
        logger.debug('sample datetime:' +
                     CommonUtil.get_string_from_datetime(sample_datetime))
        # past the end of the collection range; stop
        if sample_datetime > CommonUtil.get_datetime_from_string(
                PRICE_END_TIME):
            break
        # belongs to the current sample point; append to its price list (intervals are half-open [,))
        sample_price_list.append(price_value)
    # handle the price list of the final sample time
    # if the last sample point has prices
    if len(sample_price_list) > 0:
        price_sum = 0
        for price_item in sample_price_list:
            price_sum += price_item
        average_price = round(price_sum / len(sample_price_list),
                              CURRENCY_PAIR_PRECISION + 2)
        sample_datetime_str = CommonUtil.get_string_from_datetime(
            sample_datetime)
        average_price_item = [sample_datetime_str, average_price]
        # append the sample time and its computed price to the list
        processedPriceList.append(average_price_item)
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, processedPriceList)
    logger.info("Process Original Price Done!")
Example No. 12
def read_segmented_news():
    logger.info("In Read Segmented News...")
    global newsSegmentationList
    newsSegmentationList = CommonUtil.read_csv(SEGMENTED_NEWS_PATH)
    logger.info("Read Segmented News Done!")