def generate_feature_vector():
    """Build news-influence feature vectors per price sample and save them.

    Steps, in order:

    1. Call ``prepare_feature()`` and append a header row (the keys of
       ``featureDict`` followed by ``'TARGET'``) to ``featureVectorList``.
    2. Load the news features and the processed prices from CSV into the
       module-level ``newsFeatureList`` and ``processedPriceList``.
    3. Pass every news timestamp through ``CommonUtil.reset_news_time``.
    4. For each price whose time lies in
       ``[PRICE_START_TIME, PRICE_END_TIME)``, walk minute by minute from
       the previous price to the current one. For each minute, locate the
       first news item whose age is within
       ``NEWS_INFLUENCE_DACAY_THRESHOLD`` minutes and add every news item
       from there up to that minute, weighted by ``decay_influence``.
       If at least one minute was influenced, append the accumulated
       vector plus the scaled, rounded price change to
       ``featureVectorList``.
    5. Write ``featureVectorList`` to
       ``FEATURE_VECTOR_PATH + '_' + PRICE_SAMPLE_MINUTE + CSV_FILE_SUFFIX``.

    Assumptions to confirm against the data and ``CommonUtil``:

    * News rows are sorted by ascending time; the early ``break``s and the
      moving ``last_news_begin`` cursor rely on it.
    * ``CommonUtil.get_interval_seconds(a, b)`` presumably returns
      ``a - b`` in seconds.
    * ``CommonUtil.get_datetime_from_string`` is presumably a pure parser,
      so parsing each news timestamp once is equivalent to re-parsing it.

    Returns:
        None. Results are delivered through ``featureVectorList`` and the
        CSV file written at the end.
    """
    global newsFeatureList, processedPriceList

    logger.info("In Generate Feature Vector...")
    prepare_feature()

    # Header row: one column per feature, then the target column.
    title_list = list(featureDict)
    title_list.append('TARGET')
    featureVectorList.append(title_list)
    feature_size = len(featureDict)

    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    processedPriceList = CommonUtil.read_csv(file_path)

    # Original author's note: news runs from 2016-06-30 to 2017-12-29,
    # prices run from 2016-07-01 to 2017-12-29.
    price_start_time = CommonUtil.get_datetime_from_string(PRICE_START_TIME)
    price_end_time = CommonUtil.get_datetime_from_string(PRICE_END_TIME)

    # Original author's note: news published while the market is closed is
    # treated as published NEWS_INFLUENCE_MOST minutes before the open.
    # Rows are updated in place.
    for news_feature in newsFeatureList:
        news_feature[0] = CommonUtil.reset_news_time(
            news_feature[0], NEWS_INFLUENCE_MOST,
            MARKET_OPEN_TIME, MARKET_CLOSE_TIME)

    # Parse each (re-timed) news timestamp once instead of on every
    # minute x news iteration of the loops below.
    news_times = [CommonUtil.get_datetime_from_string(news_feature[0])
                  for news_feature in newsFeatureList]
    news_count = len(newsFeatureList)
    influence_window_seconds = NEWS_INFLUENCE_DACAY_THRESHOLD * 60

    # Cursor into the news list: scanning for a minute never needs to start
    # before the position reached for the previous minute.
    last_news_begin = 0
    # Pre-bound so that `last_news_begin = begin_index` below stays valid
    # when the scan range is empty (e.g. no news rows at all).
    begin_index = last_news_begin

    # Placeholder "previous price" used before the first real price is seen.
    pre_price_item = [PRICE_START_TIME, 0]

    for current_price_item in processedPriceList:
        current_price_time = CommonUtil.get_datetime_from_string(
            current_price_item[0])

        if price_start_time <= current_price_time < price_end_time:
            # Price change relative to the previous sample.
            price_delta = round(
                (float(current_price_item[1]) - float(pre_price_item[1]))
                * FEATURE_VECTOR_SCALE,
                CURRENCY_PAIR_PRECISION)
            pre_price_time = CommonUtil.get_datetime_from_string(
                pre_price_item[0])
            logger.debug(current_price_time)

            # Sum the news effect over (pre_price_time, current_price_time].
            # Original author's note: last_interval_minutes >= 1.
            last_interval_minutes = int(
                CommonUtil.get_interval_seconds(
                    current_price_time, pre_price_time) / 60)
            influence_feature_vector = [0.0] * feature_size
            is_influenced_price = False

            # Evaluate the news influence at every minute between the two
            # prices.
            for minute_i in range(last_interval_minutes):
                # Minute under evaluation: strictly after pre_price_time,
                # up to and including current_price_time.
                time_i = CommonUtil.get_minute_changed(
                    pre_price_time, minute_i + 1)

                # Find the first news item that still influences time_i.
                for begin_index in range(last_news_begin, news_count):
                    interval_seconds = CommonUtil.get_interval_seconds(
                        time_i, news_times[begin_index])

                    if 0 <= interval_seconds <= influence_window_seconds:
                        # Find the exclusive end of the run of news items
                        # that are not later than time_i.
                        for end_index in range(begin_index, news_count):
                            if news_times[end_index] > time_i:
                                break
                        else:
                            # No later news exists. Without this the end
                            # would stay at the last index and the final
                            # news item would be left out of the sum.
                            end_index = news_count

                        logger.debug('{}: news->{} : {}'.format(
                            minute_i + 1, begin_index, end_index - 1))

                        for news_index in range(begin_index, end_index):
                            current_news_feature = newsFeatureList[news_index]
                            influence_score = decay_influence(
                                news_times[news_index], time_i)
                            # Column 0 is the timestamp; feature values
                            # start at column 1.
                            for value_i in range(feature_size):
                                influence_feature_vector[value_i] += (
                                    float(current_news_feature[value_i + 1])
                                    * influence_score)

                        is_influenced_price = True
                        break
                    elif interval_seconds < 0:
                        # This news item is later than time_i; given the
                        # assumed ordering, so is every following one.
                        break

                last_news_begin = begin_index

            # Only prices touched by at least one news item produce a row.
            if is_influenced_price:
                influence_feature_vector.append(price_delta)
                featureVectorList.append(influence_feature_vector)

        # Always advance the previous-price pointer, including for rows
        # outside the time window, so the next delta is taken against the
        # immediately preceding price row.
        pre_price_item = current_price_item

    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, featureVectorList)
    logger.info("Generate Feature Vector Done!")