def feature_about():
    # Fetch the feature (keyword) dictionary
    feature_dict = NewsUtil.get_feature()
    # For each feature word found in the news, collect it and the 5 words that follow it
    logger.info("In Prepare Raw News...")
    raw_news_data = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH)
    raw_news_table = raw_news_data.sheet_by_index(0)
    raw_news_rows = raw_news_table.nrows
    segmentor = Segmentor()  # create the segmentor instance
    # Load the segmentation model; the second argument is the external lexicon file path
    segmentor.load_with_lexicon(cws_model_path, CFETSFX_LEXICON_PATH)
    feature_about_list = list()
    for rowN in range(0, raw_news_rows):
        news_content = raw_news_table.cell_value(rowN, 2)
        sentences = SentenceSplitter.split(news_content)
        for sentence in sentences:
            print(sentence)
            # Word segmentation
            words = segmentor.segment(sentence)
            print(list(words))
            for word_index in range(0, len(words)):
                word = words[word_index]
                for feature_word in feature_dict.values():
                    if feature_word in word:
                        about_list = list()
                        count = 0
                        while word_index < len(words) and count < 6:
                            about_list.append(words[word_index])
                            count += 1
                            word_index += 1
                        feature_about_list.append(about_list)
                        print(about_list)
                        break
    segmentor.release()
    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)
def link_vec(NEWS_VEC, PRICE_VEC, p):
    ALL_VEC = []
    # Width of a news vector; the first element is the join key (timestamp)
    news_vec_width = len(NEWS_VEC[0]) if NEWS_VEC else 1
    for each_price_vec in PRICE_VEC:
        matched = False
        for each_news_vec in NEWS_VEC:
            if each_news_vec[0] == each_price_vec[0]:
                print("matched!")
                vec_of_all = each_news_vec[::]
                vec_of_all.append(each_price_vec[1])
                vec_of_all.append(each_price_vec[2])
                ALL_VEC.append(vec_of_all)
                matched = True
                break
        if not matched:
            print("not matched! Created!")
            # No news vector shares this price record's key: pad the news features with zeros
            list_temp = [each_price_vec[0]]
            list_temp += [0] * (news_vec_width - 1)
            list_temp.append(each_price_vec[1])
            list_temp.append(each_price_vec[2])
            ALL_VEC.append(list_temp)
    CommonUtil.write_csv(p, ALL_VEC)
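# A minimal usage sketch for link_vec, assuming each news vector begins with a
# timestamp key followed by its feature values, and each price vector is
# [timestamp, price, price_delta]. The sample values and output path are hypothetical.
def demo_link_vec():
    demo_news_vec = [
        ['2017-01-03 09:30', 0.2, 0.0, 0.8],
        ['2017-01-03 09:35', 0.0, 0.5, 0.1],
    ]
    demo_price_vec = [
        ['2017-01-03 09:30', 6.942, 0.003],  # key matches a news vector
        ['2017-01-03 09:40', 6.945, 0.003],  # no match: news features are zero-padded
    ]
    link_vec(demo_news_vec, demo_price_vec, '../files/ALL_VEC_DEMO.csv')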
def get_feature_value():
    feature_vector_list = list()
    feature_name_list = list()
    for feature_name in FEATURE_NAME_LIST:
        feature_name_list.append(feature_name)
        feature_list = CommonUtil.read_csv(
            MarketDataCrawler.MARKET_DATA_PATH + '/' + feature_name + '.csv')
        feature_dict = dict()
        for feature_i in range(1, len(feature_list)):
            date = CommonUtil.get_datetime_from_string_(
                feature_list[feature_i][0]).date()
            # Opening price
            feature_value = float(feature_list[feature_i][1])
            feature_dict[date] = feature_value
        for value_key in valueDict.keys():
            if value_key in feature_dict.keys():
                feature_value = feature_dict[value_key]
            else:
                feature_value = 'N/A'
            if value_key in featureValue.keys():
                feature_items = featureValue[value_key]
            else:
                feature_items = list()
            feature_items.append(feature_value)
            featureValue[value_key] = feature_items
    feature_name_list.append(VALUE_NAME)
    feature_name_list.insert(0, 'DATE')
    feature_vector_list.append(feature_name_list)
    for key in featureValue.keys():
        feature_items = featureValue[key]
        feature_items.append(valueDict[key])
        feature_items.insert(0, key)
        feature_vector_list.append(feature_items)
    CommonUtil.write_csv('../files/marketdata/FEATURE_VECTOR.csv',
                         feature_vector_list)
def news_sentiment():
    logger.info("In News Sentiment...")
    count = 1
    for mapped_news in newsMappedList:
        feature_vector_item = list()
        news_index = mapped_news[0]
        news_time = mapped_news[1]
        feature_vector_item.append(news_index)
        feature_vector_item.append(news_time)
        feature_vector = list()
        keyword_sentiment_dict = dict()
        # List indices start at 0, so subtract 1 from the 1-based news index
        news_mapped = newsList[news_index - 1]
        for mapped_news_index in range(2, len(mapped_news)):
            keyword = mapped_news[mapped_news_index]
            sentiment_result = BaiduNLPProcessor.sentiment_classify(
                news_mapped[2])
            keyword_sentiment_dict[keyword] = sentiment_result
        keys = featureDict.keys()
        for key in keys:
            if featureDict[key] in keyword_sentiment_dict.keys():
                feature_vector.append(keyword_sentiment_dict[featureDict[key]])
            else:
                feature_vector.append(0)
        feature_vector_item.append(feature_vector)
        newsItemList.append(feature_vector_item)
        feature_vector.insert(0, news_time)
        newsFeatureList.append(feature_vector)
        logger.info(count)
        count += 1
    CommonUtil.write_csv(NEWS_ITEM_PATH, newsItemList)
    CommonUtil.write_csv(NEWS_FEATURE_PATH, newsFeatureList)
    logger.info("News Sentiment Done!")
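# BaiduNLPProcessor.sentiment_classify is used above but not shown in this section.
# A minimal sketch of such a wrapper on the official baidu-aip SDK, assuming the
# value fed into the feature vector is the sentiment label (0 negative, 1 neutral,
# 2 positive); APP_ID, API_KEY and SECRET_KEY are placeholders, and the real
# wrapper may return a different score (e.g. positive_prob).
from aip import AipNlp

_nlp_client = AipNlp('APP_ID', 'API_KEY', 'SECRET_KEY')

def sentiment_classify_sketch(text):
    # Call Baidu's sentiment classification and return the first item's label
    result = _nlp_client.sentimentClassify(text)
    items = result.get('items', [])
    return items[0]['sentiment'] if items else 0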
def news_segment():
    logger.info("In Segment News...")
    count = 1
    for news_item in newsList:
        word_list = BaiduNLPProcessor.lexer(news_item[2])
        word_list.insert(0, news_item[0])
        word_list.insert(1, CommonUtil.get_string_from_datetime(news_item[1]))
        newsSegmentationList.append(word_list)
        logger.info(count)
        count += 1
    CommonUtil.write_csv(SEGMENTED_NEWS_PATH, newsSegmentationList)
    logger.info("Segment News...Done!")
def adjust_feature_vector():
    feature_vector_list = CommonUtil.read_csv(
        '../files/marketdata/FEATURE_VECTOR.csv')
    pre_item = feature_vector_list[0]
    current_item = pre_item
    # Forward-fill: replace every 'N/A' cell with the value from the previous row
    for vector_i in range(1, len(feature_vector_list)):
        current_item = feature_vector_list[vector_i]
        for i in range(1, len(current_item)):
            if current_item[i] == 'N/A':
                current_item[i] = pre_item[i]
        feature_vector_list[vector_i] = current_item
        pre_item = current_item
    CommonUtil.write_csv('../files/marketdata/ADJUSTED_FEATURE_VECTOR.csv',
                         feature_vector_list)
def reduce_feature_vector():
    logger.info("In Reduce Feature Vector...")
    prepare_feature()
    origin_feature_num = len(featureDict.keys())
    global featureVectorList
    reduced_feature_vector_list = list()
    feature_list = list()
    feature_count_threshold = 2
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    featureVectorList = CommonUtil.read_csv(file_path)
    feature_count_dict = dict()
    feature_count_list = [0] * origin_feature_num
    is_title = True
    # Count how many rows have a non-zero value for each feature
    for feature_vector in featureVectorList:
        if is_title:
            is_title = False
        else:
            for feature_value_index in range(0, origin_feature_num):
                if feature_vector[feature_value_index] != '0.0':
                    feature_count_list[feature_value_index] += 1
    feature_index = 0
    for key in featureDict.keys():
        feature_count = feature_count_list[feature_index]
        feature_count_dict[key] = feature_count
        if feature_count >= feature_count_threshold:
            feature_list.append(feature_index)
        feature_index += 1
    logger.info('Reduce Feature Vector to: ' + str(len(feature_list)))
    feature_list.append(origin_feature_num)
    # Assemble the reduced vectors, keeping only the feature columns whose count
    # meets the threshold (plus the target column)
    for feature_vector in featureVectorList:
        reduced_feature_vector = list()
        for feature_value_index in range(0, origin_feature_num + 1):
            if feature_value_index in feature_list:
                try:
                    reduced_feature_vector.append(
                        feature_vector[feature_value_index])
                except IndexError:
                    logger.error(feature_vector)
                    logger.error(feature_value_index)
        reduced_feature_vector_list.append(reduced_feature_vector)
    file_path = REDUCED_FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, reduced_feature_vector_list)
    logger.info("Reduce Feature Vector Done!")
# The coefficients
print('Coefficients: \n', clf.coef_)
print('Intercept: \n', clf.intercept_)
print("Mean absolute error:", mean_absolute_error(y_test, y_pred))
# The mean squared error
print("Mean squared error:", mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Model score: %.2f' % clf.score(X_test, y_test))
print('R2 score: %.2f' % r2_score(y_test, y_pred))
y_pred = clf.predict(X_test)
y_pred = y_pred.reshape(len(y_pred), 1)
y_test = y_test.reshape(len(y_test), 1)
CommonUtil.write_csv('C:/Users/yuzhe/Desktop/OptionAnalysis/files/y_test.csv', y_test)
CommonUtil.write_csv('C:/Users/yuzhe/Desktop/OptionAnalysis/files/y_pred.csv', y_pred)

# Plot predictions against the test targets, using the sample index as the x axis
X_test = [i for i in range(len(y_test))]
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=2)
plt.xticks(())
plt.yticks(np.linspace(6.5, 7, 10))
plt.show()
def generate_feature_vector():
    logger.info("In Generate Feature Vector...")
    prepare_feature()
    # Set the title row
    title_list = list(featureDict.keys())
    title_list.append('TARGET')
    featureVectorList.append(title_list)
    feature_size = len(featureDict.keys())
    global newsFeatureList
    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    global processedPriceList
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    processedPriceList = CommonUtil.read_csv(file_path)
    # News runs from 20160630 to 20171229; prices run from 20160701 to 20171229
    last_news_begin = 0
    news_feature_begin_index = last_news_begin
    pre_price_item = list()
    pre_price_item.append(PRICE_START_TIME)
    pre_price_item.append(0)
    price_start_time = CommonUtil.get_datetime_from_string(PRICE_START_TIME)
    price_end_time = CommonUtil.get_datetime_from_string(PRICE_END_TIME)
    # Treat news published while the market is closed as if it occurred
    # NEWS_INFLUENCE_MOST minutes before the market opens
    for news_index in range(0, len(newsFeatureList)):
        news_feature = newsFeatureList[news_index]
        news_time = news_feature[0]
        # Reset the news time
        news_feature[0] = CommonUtil.\
            reset_news_time(news_time, NEWS_INFLUENCE_MOST, MARKET_OPEN_TIME,
                            MARKET_CLOSE_TIME)
        newsFeatureList[news_index] = news_feature
    for current_price_item in processedPriceList:
        current_price_time = CommonUtil.get_datetime_from_string(
            current_price_item[0])
        if price_start_time <= current_price_time < price_end_time:
            # Compute the price change between the two sample points
            price_delta = round(
                (float(current_price_item[1]) - float(pre_price_item[1]))
                * FEATURE_VECTOR_SCALE, CURRENCY_PAIR_PRECISION)
            pre_price_time = CommonUtil.get_datetime_from_string(
                pre_price_item[0])
            logger.debug(current_price_time)
            # Sum the influence of the news between pre_price_time and
            # current_price_time; last_interval_minutes >= 1
            last_interval_minutes = int(
                CommonUtil.get_interval_seconds(current_price_time,
                                                pre_price_time) / 60)
            influence_feature_vector = [0.0] * feature_size
            # For each sampled minute between the two prices, compute the news influence
            is_influenced_price = False
            for minute_i in range(0, last_interval_minutes):
                # The minute being evaluated: after pre_price_time, up to and
                # including current_price_time
                time_i = CommonUtil.get_minute_changed(pre_price_time,
                                                       minute_i + 1)
                # Find the news items that influence this minute
                for news_feature_begin_index in range(last_news_begin,
                                                      len(newsFeatureList)):
                    interval_seconds = CommonUtil.get_interval_seconds(
                        time_i,
                        CommonUtil.get_datetime_from_string(
                            newsFeatureList[news_feature_begin_index][0]))
                    # If a news item falls within its influence window
                    if 0 <= interval_seconds <= NEWS_INFLUENCE_DACAY_THRESHOLD * 60:
                        for news_feature_end_index in range(
                                news_feature_begin_index, len(newsFeatureList)):
                            if CommonUtil.get_datetime_from_string(
                                    newsFeatureList[news_feature_end_index][0]) \
                                    > time_i:
                                break
                        str_begin_end = str(minute_i + 1) + ': news->' + str(
                            news_feature_begin_index) + ' : ' + str(
                            news_feature_end_index - 1)
                        logger.debug(str_begin_end)
                        for news_feature_index in range(
                                news_feature_begin_index,
                                news_feature_end_index):
                            current_news_feature = newsFeatureList[
                                news_feature_index]
                            influence_score = decay_influence(
                                CommonUtil.get_datetime_from_string(
                                    current_news_feature[0]), time_i)
                            for value_i in range(0, feature_size):
                                influence_feature_vector[value_i] += \
                                    float(current_news_feature[value_i + 1]) \
                                    * influence_score
                            is_influenced_price = True
                        break
                    elif interval_seconds < 0:
                        break
                last_news_begin = news_feature_begin_index
            if is_influenced_price:
                influence_feature_vector.append(price_delta)
                featureVectorList.append(influence_feature_vector)
            pre_price_item = current_price_item
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, featureVectorList)
    logger.info("Generate Feature Vector Done!")
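# decay_influence is referenced above but not defined in this section. A minimal
# linear-decay sketch, assuming a news item's influence falls from 1.0 at
# publication time to 0.0 once NEWS_INFLUENCE_DACAY_THRESHOLD minutes have passed;
# the real implementation may use a different decay curve.
def decay_influence_sketch(news_time, sample_time):
    # Minutes elapsed since the news was published (positive when sample_time is later)
    elapsed_minutes = CommonUtil.get_interval_seconds(sample_time, news_time) / 60.0
    if elapsed_minutes < 0 or elapsed_minutes > NEWS_INFLUENCE_DACAY_THRESHOLD:
        return 0.0
    return 1.0 - elapsed_minutes / NEWS_INFLUENCE_DACAY_THRESHOLD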
def process_original_price():
    logger.info("In Process Original Price...")
    global originalPriceList
    originalPriceList = CommonUtil.read_csv(ORIGINAL_PRICE_PATH)
    sample_datetime = None
    sample_price_list = list()
    # Process each original price quote
    for original_price in originalPriceList:
        logger.debug('price time: ' + original_price[0])
        price_datetime = CommonUtil.get_datetime_from_string(original_price[0])
        price_value = float(original_price[1])
        if sample_datetime is None:
            sample_datetime = CommonUtil.get_datetime_from_string(
                PRICE_START_TIME)
        time_interval = CommonUtil.get_interval_seconds(
            price_datetime, sample_datetime)
        # The quote lies outside the sampling window (well before the current
        # sample point); move on to the next quote
        if time_interval < -PRICE_SAMPLE_MINUTE * 60 / 2:
            continue
        # If the quote falls beyond the current sampling window (too late), first
        # average the prices of the previous sample point, then find the next one
        while time_interval >= PRICE_SAMPLE_MINUTE * 60 / 2:
            # If the current sample point has collected prices
            if len(sample_price_list) > 0:
                price_sum = 0
                for price_item in sample_price_list:
                    price_sum += price_item
                average_price = round(price_sum / len(sample_price_list),
                                      CURRENCY_PAIR_PRECISION + 2)
                sample_datetime_str = CommonUtil.get_string_from_datetime(
                    sample_datetime)
                average_price_item = [sample_datetime_str, average_price]
                # Append the sample time and its averaged price to the list
                processedPriceList.append(average_price_item)
                # Reset the sample-point price list
                sample_price_list = list()
            # Move on to the next sample point
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, PRICE_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(
                price_datetime, sample_datetime)
            logger.debug('sample datetime:' +
                         CommonUtil.get_string_from_datetime(sample_datetime))
            # Stop if the sample point runs past the end of the collection period
            if sample_datetime > CommonUtil.get_datetime_from_string(
                    PRICE_END_TIME):
                break
        # The quote belongs to the current sample point (half-open window [,)),
        # add it to the sample-point price list
        sample_price_list.append(price_value)
    # Handle the price list of the final sample point
    if len(sample_price_list) > 0:
        price_sum = 0
        for price_item in sample_price_list:
            price_sum += price_item
        average_price = round(price_sum / len(sample_price_list),
                              CURRENCY_PAIR_PRECISION + 2)
        sample_datetime_str = CommonUtil.get_string_from_datetime(
            sample_datetime)
        average_price_item = [sample_datetime_str, average_price]
        # Append the sample time and its averaged price to the list
        processedPriceList.append(average_price_item)
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, processedPriceList)
    logger.info("Process Original Price Done!")
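# A small illustration of the sampling rule used above: a quote at price_dt is
# assigned to the sample point sample_dt when it falls in the half-open window
# [sample_dt - sample_minute/2, sample_dt + sample_minute/2). The helper name is
# hypothetical and only restates the comparisons already made in
# process_original_price.
def belongs_to_sample(price_dt, sample_dt, sample_minute):
    interval_seconds = CommonUtil.get_interval_seconds(price_dt, sample_dt)
    return -sample_minute * 60 / 2 <= interval_seconds < sample_minute * 60 / 2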