def decay_influence(dt_news_time, dt_current_time):
    """Return the scaled influence a news item has at ``dt_current_time``.

    The elapsed time is whatever ``CommonUtil.get_interval_seconds(
    dt_current_time, dt_news_time)`` reports (presumably current minus news,
    in seconds -- confirm against CommonUtil).

    The raw score is piecewise linear in the elapsed time:

    * up to ``NEWS_INFLUENCE_MOST`` minutes it rises proportionally, reaching
      1 at exactly ``NEWS_INFLUENCE_MOST`` minutes;
    * from there to ``NEWS_INFLUENCE_DACAY_THRESHOLD`` minutes it falls
      linearly back to 0;
    * beyond ``NEWS_INFLUENCE_DACAY_THRESHOLD`` minutes the function returns
      the integer 0 without scaling or rounding.

    Otherwise the raw score is multiplied by ``FEATURE_VECTOR_SCALE`` and
    rounded to ``CURRENCY_PAIR_PRECISION`` decimal places.
    """
    elapsed_seconds = CommonUtil.get_interval_seconds(dt_current_time,
                                                      dt_news_time)

    # Too old to matter any more.
    if elapsed_seconds > NEWS_INFLUENCE_DACAY_THRESHOLD * 60:
        return 0

    peak_seconds = NEWS_INFLUENCE_MOST * 60
    if elapsed_seconds <= peak_seconds:
        # Rising phase: 0 at the news time, 1 at the peak.
        raw_score = 1 / peak_seconds * elapsed_seconds
    else:
        # Falling phase: 1 at the peak, 0 at the decay threshold.
        decay_span = NEWS_INFLUENCE_DACAY_THRESHOLD - NEWS_INFLUENCE_MOST
        raw_score = NEWS_INFLUENCE_DACAY_THRESHOLD / decay_span \
            - 1 / decay_span * elapsed_seconds / 60

    return round(raw_score * FEATURE_VECTOR_SCALE, CURRENCY_PAIR_PRECISION)
def process_original_news_vec(seg_news_vec, st, et):
    """Bucket news rows onto a 60-minute sampling grid and sum values per key.

    Args:
        seg_news_vec: iterable of rows used as ``[time_string, key, value,
            ...]``. ``row[0]`` is parsed with
            ``CommonUtil.get_datetime_from_string_``; ``row[1]`` is the
            grouping key and ``row[2]`` is converted with ``float``. Rows are
            presumably in ascending time order -- the single forward pass
            below relies on it; confirm against the caller.
        st: time string of the first sample point.
        et: time string after which sampling stops.

    Returns:
        Whatever ``dimension89`` returns for the list of
        ``[sample_time_string] + dict_to_vec(per_key_totals)`` rows, one row
        per sample point that received at least one news row.

    A row belongs to the current sample point when its offset from that point
    (as reported by ``CommonUtil.get_interval_seconds(row_time, sample_time)``)
    lies in the half-open range ``[-half_window, +half_window)``. Rows earlier
    than that are skipped; rows later than that advance the sample point via
    ``CommonUtil.get_next_sample_time`` until they fit.

    ``st`` and ``et`` are parsed once, when the first row is seen, so an empty
    input never touches them.
    """
    MARKET_OPEN_TIME = '09:30:00'
    MARKET_CLOSE_TIME = '23:30:00'
    NEWS_SAMPLE_MINUTE = 60
    half_window_seconds = NEWS_SAMPLE_MINUTE * 60 / 2

    def build_sample_row(sample_time, news_items):
        """Sum ``news_items`` values per key and prefix the sample time."""
        totals = {}
        for item in news_items:
            key = item[0]
            value = float(item[1])
            if key in totals:
                totals[key] += value
            else:
                totals[key] = value
        sentiment_values = dict_to_vec(totals)
        sample_time_str = CommonUtil.get_string_from_datetime(sample_time)
        return [sample_time_str] + sentiment_values

    processedNewsList = []
    pending_news = []
    sample_datetime = None
    end_datetime = None

    for original_vec in seg_news_vec:
        news_datetime = CommonUtil.get_datetime_from_string_(original_vec[0])
        news_vec = original_vec[1:]

        if sample_datetime is None:
            sample_datetime = CommonUtil.get_datetime_from_string_(st)
            end_datetime = CommonUtil.get_datetime_from_string_(et)

        time_interval = CommonUtil.get_interval_seconds(news_datetime,
                                                        sample_datetime)

        # Row is well before the current sample point: ignore it.
        if time_interval < -half_window_seconds:
            continue

        # Row is past the current sample window: emit what has been collected
        # for this point (if anything), then step forward until the row fits.
        while time_interval >= half_window_seconds:
            if pending_news:
                processedNewsList.append(
                    build_sample_row(sample_datetime, pending_news))
                pending_news = []
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, NEWS_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(news_datetime,
                                                            sample_datetime)

        # The sample grid has moved past the requested end: stop reading rows.
        if sample_datetime > end_datetime:
            break

        pending_news.append(news_vec)

    # Emit the last, still-open sample point.
    if pending_news:
        processedNewsList.append(
            build_sample_row(sample_datetime, pending_news))

    return dimension89(processedNewsList)
def process_original_price():
    """Resample the raw price CSV onto the sampling grid and save the result.

    Reads ``ORIGINAL_PRICE_PATH`` into the module-level ``originalPriceList``
    (each row is used as ``[time_string, price, ...]``), averages the prices
    that fall around each sample point, appends one
    ``[sample_time_string, average_price]`` row per non-empty sample point to
    the module-level ``processedPriceList``, and writes that list to
    ``PROCESSED_PRICE_PATH + '_' + str(PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX``.

    A price belongs to the current sample point when its offset from that
    point (as reported by ``CommonUtil.get_interval_seconds(price_time,
    sample_time)``) lies in the half-open range
    ``[-half_window, +half_window)``. Earlier prices are skipped; later ones
    advance the sample point via ``CommonUtil.get_next_sample_time``. Averages
    are rounded to ``CURRENCY_PAIR_PRECISION + 2`` decimal places.

    Takes no arguments and returns nothing.

    NOTE(review): ``processedPriceList`` is only appended to here, never
    cleared, so a second call in the same process would append a second copy
    of the rows -- confirm callers run this once.
    """
    global originalPriceList

    def average_row(sample_time, prices):
        """Return ``[sample_time_string, rounded mean of prices]``."""
        # Plain left-to-right accumulation, kept deliberately instead of
        # ``sum()`` so the floating-point result stays bit-identical.
        total = 0
        for price in prices:
            total += price
        mean_price = round(total / len(prices), CURRENCY_PAIR_PRECISION + 2)
        return [CommonUtil.get_string_from_datetime(sample_time), mean_price]

    logger.info("In Process Original Price...")
    originalPriceList = CommonUtil.read_csv(ORIGINAL_PRICE_PATH)

    half_window_seconds = PRICE_SAMPLE_MINUTE * 60 / 2
    pending_prices = []
    sample_datetime = None
    end_datetime = None

    for original_price in originalPriceList:
        logger.debug('price time: ' + original_price[0])
        price_datetime = CommonUtil.get_datetime_from_string(original_price[0])
        price_value = float(original_price[1])

        # Both bounds are parsed once, when the first row is seen.
        if sample_datetime is None:
            sample_datetime = CommonUtil.get_datetime_from_string(
                PRICE_START_TIME)
            end_datetime = CommonUtil.get_datetime_from_string(PRICE_END_TIME)

        time_interval = CommonUtil.get_interval_seconds(price_datetime,
                                                        sample_datetime)

        # Price is well before the current sample point: ignore it.
        if time_interval < -half_window_seconds:
            continue

        # Price is past the current sample window: emit the average collected
        # for this point (if any), then step forward until the price fits.
        while time_interval >= half_window_seconds:
            if pending_prices:
                processedPriceList.append(
                    average_row(sample_datetime, pending_prices))
                pending_prices = []
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, PRICE_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(price_datetime,
                                                            sample_datetime)

        logger.debug('sample datetime:' +
                     CommonUtil.get_string_from_datetime(sample_datetime))

        # The sample grid has moved past the configured end: stop reading.
        if sample_datetime > end_datetime:
            break

        pending_prices.append(price_value)

    # Emit the last, still-open sample point.
    if pending_prices:
        processedPriceList.append(average_row(sample_datetime, pending_prices))

    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, processedPriceList)
    logger.info("Process Original Price Done!")
def generate_feature_vector():
    """Build news-influence feature vectors with price deltas and save them.

    Reads the news feature rows from ``NEWS_FEATURE_PATH`` and the resampled
    price rows from ``PROCESSED_PRICE_PATH + '_' + str(PRICE_SAMPLE_MINUTE) +
    CSV_FILE_SUFFIX``, rebinding the module-level ``newsFeatureList`` and
    ``processedPriceList``. For every price row whose time lies in
    ``[PRICE_START_TIME, PRICE_END_TIME)`` it steps minute by minute from the
    previous price time to the current one, accumulates the feature values of
    the news rows in range weighted by ``decay_influence``, and -- when at
    least one minute had news in range -- appends the accumulated vector plus
    the scaled price delta to the module-level ``featureVectorList``. Finally
    the list is written to ``FEATURE_VECTOR_PATH + '_' +
    str(PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX``.

    Takes no arguments and returns nothing; all inputs and outputs go through
    module-level names and CSV files.

    NOTE(review): the scan below only ever moves forward through
    ``newsFeatureList`` and ``processedPriceList``, so it presumably expects
    both to be sorted by ascending time -- confirm.
    """
    logger.info("In Generate Feature Vector...")
    prepare_feature()
    # Header row: one column per feature key, plus the 'TARGET' column.
    title_list = list(featureDict.keys())
    title_list.append('TARGET')
    featureVectorList.append(title_list)
    feature_size = len(featureDict.keys())
    global newsFeatureList
    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    global processedPriceList
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    processedPriceList = CommonUtil.read_csv(file_path)
    # News runs from 20160630 to 20171229; prices run from 20160701 to 20171229.
    # last_news_begin remembers where the previous scan stopped so later scans
    # do not restart from the first news row.
    last_news_begin = 0
    # Pre-bound so the assignment after the scan loop is defined even when
    # that loop's range is empty.
    news_feature_begin_index = last_news_begin
    # Seed the "previous price" with the start time and a price of 0.
    pre_price_item = list()
    pre_price_item.append(PRICE_START_TIME)
    pre_price_item.append(0)
    price_start_time = CommonUtil.get_datetime_from_string(PRICE_START_TIME)
    price_end_time = CommonUtil.get_datetime_from_string(PRICE_END_TIME)
    # Treat every news item published while the market is closed as having
    # happened NEWS_INFLUENCE_MOST minutes before the market opens.
    for news_index in range(0, len(newsFeatureList)):
        news_feature = newsFeatureList[news_index]
        news_time = news_feature[0]
        # Rewrite the news time in place.
        news_feature[0] = CommonUtil.\
            reset_news_time(news_time, NEWS_INFLUENCE_MOST, MARKET_OPEN_TIME, MARKET_CLOSE_TIME)
        newsFeatureList[news_index] = news_feature
    for current_price_item in processedPriceList:
        current_price_time = CommonUtil.get_datetime_from_string(
            current_price_item[0])
        if price_start_time <= current_price_time < price_end_time:
            # Price change since the previous price row, scaled and rounded.
            price_delta = round(
                (float(current_price_item[1]) - float(pre_price_item[1])) * FEATURE_VECTOR_SCALE,
                CURRENCY_PAIR_PRECISION)
            pre_price_time = CommonUtil.get_datetime_from_string(
                pre_price_item[0])
            logger.debug(current_price_time)
            # Sum the news influence between pre_price_time and
            # current_price_time.
            # Original author's note: last_interval_minutes >= 1 (this is not
            # enforced here; 0 simply skips the minute loop).
            last_interval_minutes = int(
                CommonUtil.get_interval_seconds(current_price_time,
                                                pre_price_time) / 60)
            influence_feature_vector = [0.0] * feature_size
            # Evaluate the news influence at every minute between the two
            # prices.
            is_influenced_price = False
            for minute_i in range(0, last_interval_minutes):
                # Instant being evaluated: minutes after pre_price_time, up to
                # and including current_price_time.
                time_i = CommonUtil.get_minute_changed(pre_price_time,
                                                       minute_i + 1)
                # Find the first news row that can still influence time_i.
                for news_feature_begin_index in range(last_news_begin,
                                                      len(newsFeatureList)):
                    interval_seconds = CommonUtil.get_interval_seconds(
                        time_i, CommonUtil.get_datetime_from_string(
                            newsFeatureList[news_feature_begin_index][0]))
                    # A news row lies inside the influence window.
                    if 0 <= interval_seconds <= NEWS_INFLUENCE_DACAY_THRESHOLD * 60:
                        # Advance to the first news row later than time_i.
                        # NOTE(review): if no such row exists the loop ends
                        # with news_feature_end_index on the last index, so
                        # the range() below leaves that last row out --
                        # confirm this is intended.
                        for news_feature_end_index in range(
                                news_feature_begin_index, len(newsFeatureList)):
                            if CommonUtil.get_datetime_from_string(newsFeatureList[news_feature_end_index][0]) \
                                    > time_i:
                                break
                        str_begin_end = str(minute_i + 1) + ': news->' + str(
                            news_feature_begin_index) + ' : ' + str(
                            news_feature_end_index - 1)
                        logger.debug(str_begin_end)
                        # Add every news row in [begin, end) to the vector,
                        # weighted by its decayed influence at time_i.
                        for news_feature_index in range(
                                news_feature_begin_index, news_feature_end_index):
                            current_news_feature = newsFeatureList[
                                news_feature_index]
                            influence_score = decay_influence(
                                CommonUtil.get_datetime_from_string(
                                    current_news_feature[0]), time_i)
                            # Column 0 is the time, so feature values start at
                            # column 1.
                            for value_i in range(0, feature_size):
                                influence_feature_vector[value_i] += float(current_news_feature[value_i + 1]) \
                                                                     * influence_score
                        is_influenced_price = True
                        break
                    # interval_seconds < 0: stop scanning for this minute.
                    elif interval_seconds < 0:
                        break
                # Resume the next minute's scan from where this one stopped.
                # NOTE(review): nesting reconstructed from whitespace-collapsed
                # source; placed inside the minute loop -- confirm.
                last_news_begin = news_feature_begin_index
            # Only emit a row when some minute in the interval had news in
            # range.
            if is_influenced_price:
                influence_feature_vector.append(price_delta)
                featureVectorList.append(influence_feature_vector)
        # NOTE(review): nesting reconstructed from whitespace-collapsed source;
        # placed at loop level so every price row becomes the next "previous"
        # row -- confirm.
        pre_price_item = current_price_item
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, featureVectorList)
    logger.info("Generate Feature Vector Done!")
def process_original_price(originalPriceList, PRICE_START_TIME, PRICE_END_TIME):
    """Resample raw price rows onto a 60-minute grid, averaging each bucket.

    Args:
        originalPriceList: iterable of rows used as ``[time_string, price,
            ...]``. ``row[0]`` is parsed with
            ``CommonUtil.get_datetime_from_string_``; ``row[1]`` is converted
            with ``float``. Rows are presumably in ascending time order -- the
            single forward pass below relies on it; confirm against the
            caller.
        PRICE_START_TIME: time string of the first sample point. Mind the
            time range actually covered by the price table.
        PRICE_END_TIME: time string after which sampling stops.

    Returns:
        A list of ``[sample_time_string, average_price]`` rows, one per sample
        point that received at least one price. Averages are rounded to
        ``CURRENCY_PAIR_PRECISION + 2`` (6) decimal places.

    Raises:
        ValueError: if a row's price field is the empty string (the message
            names the row's time string) or otherwise cannot be converted by
            ``float``.

    A price belongs to the current sample point when its offset from that
    point (as reported by ``CommonUtil.get_interval_seconds(price_time,
    sample_time)``) lies in the half-open range
    ``[-half_window, +half_window)``. Earlier prices are skipped; later ones
    advance the sample point via ``CommonUtil.get_next_sample_time``.

    The start and end bounds are parsed once, when the first row is seen, so
    an empty input never touches them.
    """
    PRICE_SAMPLE_MINUTE = 60
    CURRENCY_PAIR_PRECISION = 4
    # Market opening time.
    MARKET_OPEN_TIME = '09:30:00'
    # Market closing time.
    MARKET_CLOSE_TIME = '23:30:00'
    half_window_seconds = PRICE_SAMPLE_MINUTE * 60 / 2

    def average_row(sample_time, prices):
        """Return ``[sample_time_string, rounded mean of prices]``."""
        # Plain left-to-right accumulation, kept deliberately instead of
        # ``sum()`` so the floating-point result stays bit-identical.
        total = 0
        for price in prices:
            total += price
        mean_price = round(total / len(prices), CURRENCY_PAIR_PRECISION + 2)
        return [CommonUtil.get_string_from_datetime(sample_time), mean_price]

    # Resampled rows, e.g. [2018/6/30 15:00:00, 6.6433].
    processedPriceList = []
    pending_prices = []
    sample_datetime = None
    end_datetime = None

    for original_price in originalPriceList:
        # Fail with a message that identifies the row instead of letting
        # float('') raise an anonymous conversion error.
        if original_price[1] == '':
            raise ValueError(
                'empty price value in row timestamped {!r}'.format(
                    original_price[0]))

        price_datetime = CommonUtil.get_datetime_from_string_(
            original_price[0])
        price_value = float(original_price[1])

        if sample_datetime is None:
            sample_datetime = CommonUtil.get_datetime_from_string_(
                PRICE_START_TIME)
            end_datetime = CommonUtil.get_datetime_from_string_(
                PRICE_END_TIME)

        time_interval = CommonUtil.get_interval_seconds(price_datetime,
                                                        sample_datetime)

        # Price is well before the current sample point: ignore it.
        if time_interval < -half_window_seconds:
            continue

        # Price is past the current sample window: emit the average collected
        # for this point (if any), then step forward until the price fits.
        while time_interval >= half_window_seconds:
            if pending_prices:
                processedPriceList.append(
                    average_row(sample_datetime, pending_prices))
                pending_prices = []
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, PRICE_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(price_datetime,
                                                            sample_datetime)

        # The sample grid has moved past the requested end: stop reading.
        if sample_datetime > end_datetime:
            break

        pending_prices.append(price_value)

    # Emit the last, still-open sample point.
    if pending_prices:
        processedPriceList.append(average_row(sample_datetime, pending_prices))

    return processedPriceList