示例#1
0
def generate_feature_vector():
    """Build news-influence feature vectors per price sample and write them to CSV.

    Steps performed:

    1. Call ``prepare_feature()`` and append a header row (the keys of
       ``featureDict`` followed by ``'TARGET'``) to ``featureVectorList``.
    2. Load the news features and the processed prices from CSV into the
       module-level ``newsFeatureList`` and ``processedPriceList``.
    3. Replace every news timestamp with the value returned by
       ``CommonUtil.reset_news_time``.
    4. For every price sample whose time lies in
       ``[PRICE_START_TIME, PRICE_END_TIME)``, step minute by minute from the
       previous sample up to this one, and at each minute add the
       ``decay_influence``-weighted feature values of the news items found
       for that minute. If at least one minute had news in range, append
       the accumulated vector, with the scaled price delta as its last
       column, to ``featureVectorList``.
    5. Write ``featureVectorList`` to the feature-vector CSV file.

    Side effects: rebinds the globals ``newsFeatureList`` and
    ``processedPriceList``, appends to ``featureVectorList`` and writes a
    CSV file. Returns ``None``.

    NOTE(review): the scan below only makes sense if the news rows are
    sorted by ascending time and ``CommonUtil.get_interval_seconds(a, b)``
    returns ``a - b`` in seconds -- confirm against ``CommonUtil``.
    """
    global newsFeatureList
    global processedPriceList

    logger.info("In Generate Feature Vector...")
    prepare_feature()

    # Header row: one column per feature plus the target column.
    title_list = list(featureDict.keys())
    title_list.append('TARGET')
    featureVectorList.append(title_list)
    feature_size = len(featureDict.keys())

    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    processedPriceList = CommonUtil.read_csv(file_path)
    news_count = len(newsFeatureList)

    # News runs from 20160630 to 20171229, prices from 20160701 to 20171229.
    last_news_begin = 0
    # Pre-bound so the assignment after the news scan below is defined even
    # when that scan has nothing to iterate over.
    news_feature_begin_index = last_news_begin

    # Synthetic "previous" sample used for the first price in range.
    pre_price_item = [PRICE_START_TIME, 0]
    price_start_time = CommonUtil.get_datetime_from_string(PRICE_START_TIME)
    price_end_time = CommonUtil.get_datetime_from_string(PRICE_END_TIME)

    # Re-time news published while the market is closed so that it counts as
    # happening NEWS_INFLUENCE_MOST minutes before the market opens.
    for news_feature in newsFeatureList:
        news_feature[0] = CommonUtil.reset_news_time(
            news_feature[0], NEWS_INFLUENCE_MOST, MARKET_OPEN_TIME,
            MARKET_CLOSE_TIME)

    for current_price_item in processedPriceList:
        current_price_time = CommonUtil.get_datetime_from_string(
            current_price_item[0])
        if price_start_time <= current_price_time < price_end_time:
            # Price change relative to the previous sample.
            price_delta = round(
                (float(current_price_item[1]) - float(pre_price_item[1])) *
                FEATURE_VECTOR_SCALE, CURRENCY_PAIR_PRECISION)
            pre_price_time = CommonUtil.get_datetime_from_string(
                pre_price_item[0])
            logger.debug(current_price_time)

            # Whole minutes between the previous and the current sample.
            last_interval_minutes = int(
                CommonUtil.get_interval_seconds(current_price_time,
                                                pre_price_time) / 60)
            influence_feature_vector = [0.0] * feature_size
            is_influenced_price = False

            # Evaluate the news influence at every minute after
            # pre_price_time, up to and including current_price_time.
            for minute_i in range(last_interval_minutes):
                time_i = CommonUtil.get_minute_changed(pre_price_time,
                                                       minute_i + 1)
                # Find the first news item that still influences time_i.
                for news_feature_begin_index in range(last_news_begin,
                                                      news_count):
                    interval_seconds = CommonUtil.get_interval_seconds(
                        time_i,
                        CommonUtil.get_datetime_from_string(
                            newsFeatureList[news_feature_begin_index][0]))
                    if 0 <= interval_seconds <= NEWS_INFLUENCE_DACAY_THRESHOLD * 60:
                        # Exclusive end of the window: index of the first
                        # news item later than time_i. It defaults to
                        # news_count so that the final news item is kept
                        # when nothing later exists (previously the end
                        # index was left at news_count - 1 in that case and
                        # the last item was silently skipped).
                        news_feature_end_index = news_count
                        for candidate_index in range(
                                news_feature_begin_index, news_count):
                            if CommonUtil.get_datetime_from_string(
                                    newsFeatureList[candidate_index][0]) \
                                    > time_i:
                                news_feature_end_index = candidate_index
                                break
                        str_begin_end = str(minute_i + 1) + ': news->' + str(
                            news_feature_begin_index) + ' : ' + str(
                                news_feature_end_index - 1)
                        logger.debug(str_begin_end)
                        # Accumulate each in-window news item, weighted by
                        # its decay score at time_i.
                        for news_feature_index in range(
                                news_feature_begin_index,
                                news_feature_end_index):
                            current_news_feature = newsFeatureList[
                                news_feature_index]
                            influence_score = decay_influence(
                                CommonUtil.get_datetime_from_string(
                                    current_news_feature[0]), time_i)
                            # Column 0 is the timestamp; features start at 1.
                            for value_i in range(feature_size):
                                influence_feature_vector[value_i] += float(
                                    current_news_feature[value_i + 1]) \
                                    * influence_score
                        is_influenced_price = True
                        break
                    elif interval_seconds < 0:
                        # This news item is later than time_i; stop here.
                        break
                # Resume the next minute's scan from where this one stopped.
                last_news_begin = news_feature_begin_index

            # Only samples that had news in range for at least one minute
            # produce a row.
            if is_influenced_price:
                influence_feature_vector.append(price_delta)
                featureVectorList.append(influence_feature_vector)
        # Every sample, in range or not, becomes the next "previous" one.
        pre_price_item = current_price_item

    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, featureVectorList)
    logger.info("Generate Feature Vector Done!")