def get_feature_value():
    """Assemble the feature-vector table and write it to FEATURE_VECTOR.csv.

    For every name in ``FEATURE_NAME_LIST`` the CSV
    ``<MarketDataCrawler.MARKET_DATA_PATH>/<name>.csv`` is read.  For each
    date key of the module-level ``valueDict``, the value found in column 1
    of that CSV for the same date (or the string ``'N/A'`` when the date is
    missing) is appended to that date's row in the module-level
    ``featureValue``.

    The table written out consists of a header row
    ``['DATE', <feature names...>, VALUE_NAME]`` followed by one row per key
    of ``featureValue``: the date, the collected feature values and the
    target taken from ``valueDict``.

    Side effects: the lists stored in ``featureValue`` are extended in place,
    including the leading date and trailing target, so calling this function
    more than once keeps growing the same rows.
    """
    column_names = []
    for name in FEATURE_NAME_LIST:
        column_names.append(name)
        rows = CommonUtil.read_csv(
            MarketDataCrawler.MARKET_DATA_PATH + '/' + name + '.csv')

        # Index this feature by calendar date.  Row 0 is skipped (presumably
        # a header line); column 1 is the opening price according to the
        # original author's note.
        by_date = {}
        for row in rows[1:]:
            day = CommonUtil.get_datetime_from_string_(row[0]).date()
            by_date[day] = float(row[1])

        # One new column per feature for every known target date.
        for day in valueDict:
            featureValue.setdefault(day, []).append(by_date.get(day, 'N/A'))

    table = [['DATE'] + column_names + [VALUE_NAME]]
    for day, items in featureValue.items():
        # The stored list itself becomes the output row.
        items.append(valueDict[day])
        items.insert(0, day)
        table.append(items)

    CommonUtil.write_csv('../files/marketdata/FEATURE_VECTOR.csv', table)
def get_value_list():
    """Load the target series into the module-level ``valueDict``.

    Reads ``<MarketDataCrawler.MARKET_DATA_PATH>/<VALUE_NAME>.csv`` and, for
    every row after the first, stores ``float(row[2])`` under the calendar
    date parsed from ``row[0]``.  An existing entry for the same date is
    overwritten.  Nothing is returned.
    """
    csv_path = MarketDataCrawler.MARKET_DATA_PATH + '/' + VALUE_NAME + '.csv'
    rows = CommonUtil.read_csv(csv_path)

    # Row 0 is skipped (presumably a header line).
    for row in rows[1:]:
        day = CommonUtil.get_datetime_from_string_(row[0]).date()
        # Column 2 is the closing price according to the original author's note.
        valueDict[day] = float(row[2])
def _summed_news_row(sample_datetime, sample_news_list):
    """Collapse one sampling bucket of news entries into a single output row.

    Every entry is indexed as ``entry[0]`` (key) and ``entry[1]`` (value).
    Values are converted with ``float`` and summed per key, the totals are
    passed through ``dict_to_vec``, and the formatted sample time is
    prepended to the resulting list.
    """
    totals = {}
    for entry in sample_news_list:
        key = entry[0]
        if key in totals:
            totals[key] += float(entry[1])
        else:
            totals[key] = float(entry[1])
    sentiment_vector = dict_to_vec(totals)
    stamp = CommonUtil.get_string_from_datetime(sample_datetime)
    return [stamp] + sentiment_vector


def process_original_news_vec(seg_news_vec, st, et):
    """Resample news rows onto 60-minute sample points and sum them per key.

    Args:
        seg_news_vec: iterable of rows; ``row[0]`` is a datetime string and
            ``row[1:]`` is kept as the news entry (its first two items are
            used as key and value when summing).
        st: datetime string of the first sample point.
        et: datetime string after which sampling stops.

    Returns:
        Whatever ``dimension89`` returns for the list of aggregated rows,
        each of the form ``[sample time string] + dict_to_vec(sums)``.

    A row belongs to the current sample point when its offset from that
    point, as reported by ``CommonUtil.get_interval_seconds``, lies in the
    half-open range ``[-30 min, +30 min)``.  Rows earlier than that range
    are ignored; a later row first flushes the current bucket and then
    advances the sample point with ``CommonUtil.get_next_sample_time`` until
    the row fits.
    """
    MARKET_OPEN_TIME = '09:30:00'
    MARKET_CLOSE_TIME = '23:30:00'
    NEWS_SAMPLE_MINUTE = 60
    half_window = NEWS_SAMPLE_MINUTE * 60 / 2

    processed_news = []
    bucket = []
    sample_datetime = None
    end_datetime = None

    for original_vec in seg_news_vec:
        news_datetime = CommonUtil.get_datetime_from_string_(original_vec[0])
        news_vec = original_vec[1:]

        # The first sample point is only parsed once a row is available.
        if sample_datetime is None:
            sample_datetime = CommonUtil.get_datetime_from_string_(st)
        time_interval = CommonUtil.get_interval_seconds(
            news_datetime, sample_datetime)

        # Too early for the current sample point: move on to the next row.
        if time_interval < -half_window:
            continue

        # Too late for the current sample point: emit what was collected,
        # then step forward until this row falls inside a window.
        while time_interval >= half_window:
            if bucket:
                processed_news.append(
                    _summed_news_row(sample_datetime, bucket))
                bucket = []
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, NEWS_SAMPLE_MINUTE,
                MARKET_OPEN_TIME, MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(
                news_datetime, sample_datetime)

        # Parse the end time once, at first use, instead of on every row.
        if end_datetime is None:
            end_datetime = CommonUtil.get_datetime_from_string_(et)
        # The sample point has moved past the requested end: stop entirely.
        if sample_datetime > end_datetime:
            break

        bucket.append(news_vec)

    # Flush the bucket of the last sample point reached.
    if bucket:
        processed_news.append(_summed_news_row(sample_datetime, bucket))

    return dimension89(processed_news)
def _average_price_row(sample_datetime, sample_price_list, ndigits):
    """Return ``[sample time string, mean price rounded to ndigits]``.

    ``sample_price_list`` must be non-empty.
    """
    average_price = round(
        sum(sample_price_list) / len(sample_price_list), ndigits)
    stamp = CommonUtil.get_string_from_datetime(sample_datetime)
    return [stamp, average_price]


def process_original_price(originalPriceList, PRICE_START_TIME, PRICE_END_TIME):
    """Resample raw prices onto 60-minute sample points by averaging.

    Args:
        originalPriceList: iterable of rows; ``row[0]`` is a datetime string
            and ``row[1]`` the price (anything ``float`` accepts).
        PRICE_START_TIME: datetime string of the first sample point.
        PRICE_END_TIME: datetime string after which sampling stops.
            The original author notes that both should match the time range
            of the ORIGINAL_PRICE table (e.g. ``'2016/06/30 09:30:00'``).

    Returns:
        A list of ``[sample time string, average price]`` items, with the
        average rounded to ``CURRENCY_PAIR_PRECISION + 2`` decimal places.

    A price belongs to the current sample point when its offset from that
    point, as reported by ``CommonUtil.get_interval_seconds``, lies in the
    half-open range ``[-30 min, +30 min)``.  Earlier prices are ignored; a
    later price first flushes the current bucket and then advances the
    sample point with ``CommonUtil.get_next_sample_time`` until it fits.

    Rows whose price field is the empty string are reported with
    ``print('null')`` and skipped.  Previously such a row went on to
    ``float('')`` and aborted the whole run with ``ValueError``.
    """
    PRICE_SAMPLE_MINUTE = 60
    CURRENCY_PAIR_PRECISION = 4
    # Market opening / closing times handed to get_next_sample_time.
    MARKET_OPEN_TIME = '09:30:00'
    MARKET_CLOSE_TIME = '23:30:00'

    half_window = PRICE_SAMPLE_MINUTE * 60 / 2
    ndigits = CURRENCY_PAIR_PRECISION + 2

    processed_prices = []
    bucket = []
    sample_datetime = None
    end_datetime = None

    for original_price in originalPriceList:
        price_datetime = CommonUtil.get_datetime_from_string_(
            original_price[0])

        # A missing price cannot be averaged; report it and move on.
        if original_price[1] == '':
            print('null')
            continue
        price_value = float(original_price[1])

        # The first sample point is only parsed once a row is available.
        if sample_datetime is None:
            sample_datetime = CommonUtil.get_datetime_from_string_(
                PRICE_START_TIME)
        time_interval = CommonUtil.get_interval_seconds(
            price_datetime, sample_datetime)

        # Too early for the current sample point: take the next price.
        if time_interval < -half_window:
            continue

        # Too late for the current sample point: emit the average collected
        # so far, then step forward until this price falls inside a window.
        while time_interval >= half_window:
            if bucket:
                processed_prices.append(
                    _average_price_row(sample_datetime, bucket, ndigits))
                bucket = []
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, PRICE_SAMPLE_MINUTE,
                MARKET_OPEN_TIME, MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(
                price_datetime, sample_datetime)

        # Parse the end time once, at first use, instead of on every row.
        if end_datetime is None:
            end_datetime = CommonUtil.get_datetime_from_string_(
                PRICE_END_TIME)
        # The sample point has moved past the requested end: stop entirely.
        if sample_datetime > end_datetime:
            break

        bucket.append(price_value)

    # Flush the bucket of the last sample point reached.
    if bucket:
        processed_prices.append(
            _average_price_row(sample_datetime, bucket, ndigits))

    return processed_prices