示例#1
0
def decay_influence(dt_news_time, dt_current_time):
    """Score how strongly a news item influences the price at a given moment.

    The score is piecewise linear in the interval between the two times, as
    returned by ``CommonUtil.get_interval_seconds(dt_current_time,
    dt_news_time)``:

    * up to ``NEWS_INFLUENCE_MOST`` minutes it grows linearly from 0 to 1;
    * afterwards it falls linearly, reaching 0 at
      ``NEWS_INFLUENCE_DACAY_THRESHOLD`` minutes;
    * beyond the threshold the news has no influence.

    Args:
        dt_news_time: Time of the news item.
        dt_current_time: Time at which the influence is evaluated.

    Returns:
        ``0`` when the interval exceeds the decay threshold; otherwise the
        score multiplied by ``FEATURE_VECTOR_SCALE`` and rounded to
        ``CURRENCY_PAIR_PRECISION`` decimal places.
    """
    elapsed_seconds = CommonUtil.get_interval_seconds(dt_current_time,
                                                      dt_news_time)
    # Past the decay threshold (configured in minutes): no influence left.
    if elapsed_seconds > NEWS_INFLUENCE_DACAY_THRESHOLD * 60:
        return 0

    peak_seconds = NEWS_INFLUENCE_MOST * 60
    if elapsed_seconds <= peak_seconds:
        # Ramp-up phase: linear growth until the peak is reached.
        rise_per_second = 1 / peak_seconds
        score = rise_per_second * elapsed_seconds
    else:
        # Decay phase: straight line from 1 at the peak down to 0 at the
        # threshold, evaluated on the elapsed time expressed in minutes.
        decay_span = NEWS_INFLUENCE_DACAY_THRESHOLD - NEWS_INFLUENCE_MOST
        intercept = NEWS_INFLUENCE_DACAY_THRESHOLD / decay_span
        slope = 1 / decay_span
        score = intercept - slope * elapsed_seconds / 60

    return round(score * FEATURE_VECTOR_SCALE, CURRENCY_PAIR_PRECISION)
示例#2
0
def _summed_news_row(sample_datetime, sample_news_list):
    """Build one output row: the sample time string plus the summed news values.

    Each entry of ``sample_news_list`` is a news row without its timestamp;
    element 0 is used as the grouping key and element 1 is converted to
    ``float`` and summed per key. The per-key sums are turned into a list by
    ``dict_to_vec``.
    """
    vec_sum = {}
    for news_item in sample_news_list:
        key = news_item[0]
        value = float(news_item[1])
        if key in vec_sum:
            vec_sum[key] += value
        else:
            vec_sum[key] = value
    news_sentiment_list = dict_to_vec(vec_sum)
    sample_datetime_str = CommonUtil.get_string_from_datetime(sample_datetime)
    return [sample_datetime_str] + news_sentiment_list


def process_original_news_vec(seg_news_vec, st, et):
    """Resample raw news rows onto fixed sample points and sum them per point.

    Sample points start at ``st`` and advance with
    ``CommonUtil.get_next_sample_time`` (60-minute step, market hours
    09:30:00-23:30:00). A news row belongs to a sample point when its time
    lies in the half-open window ``[point - 30 min, point + 30 min)``.

    Args:
        seg_news_vec: Iterable of news rows. Element 0 is the timestamp
            string; the remaining elements are kept as the news vector
            (see ``_summed_news_row`` for how they are aggregated).
            Rows are consumed in order, so they are presumably sorted by
            time -- verify against the caller.
        st: Start time string, parsed with
            ``CommonUtil.get_datetime_from_string_``.
        et: End time string, parsed the same way. Processing stops once the
            current sample point is later than this time.

    Returns:
        The result of ``dimension89`` applied to the list of per-sample rows
        ``[sample time string, *summed values]``.
    """
    MARKET_OPEN_TIME = '09:30:00'
    MARKET_CLOSE_TIME = '23:30:00'
    NEWS_SAMPLE_MINUTE = 60
    # Half the sampling step, in seconds: the window radius around a point.
    half_window_seconds = NEWS_SAMPLE_MINUTE * 60 / 2

    processedNewsList = []
    sample_datetime = None
    end_datetime = None
    sample_news_list = []
    for original_vec in seg_news_vec:
        news_datetime = CommonUtil.get_datetime_from_string_(original_vec[0])
        news_vec = original_vec[1:]
        if sample_datetime is None:
            # Parse both bounds once, on the first row, instead of re-parsing
            # the end time for every row.
            sample_datetime = CommonUtil.get_datetime_from_string_(st)
            end_datetime = CommonUtil.get_datetime_from_string_(et)
        time_interval = CommonUtil.get_interval_seconds(
            news_datetime, sample_datetime)
        # News is earlier than the current window: skip to the next row.
        if time_interval < -half_window_seconds:
            continue
        # News is later than the current window: emit the finished sample
        # point (if it collected anything) and advance until the news fits.
        while time_interval >= half_window_seconds:
            if sample_news_list:
                processedNewsList.append(
                    _summed_news_row(sample_datetime, sample_news_list))
                sample_news_list = []
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, NEWS_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(
                news_datetime, sample_datetime)
        # The sample point moved past the requested end time: stop.
        if sample_datetime > end_datetime:
            break
        # The news falls inside the current window [start, end).
        sample_news_list.append(news_vec)
    # Emit the last sample point, if it collected any news.
    if sample_news_list:
        processedNewsList.append(
            _summed_news_row(sample_datetime, sample_news_list))
    return dimension89(processedNewsList)
示例#3
0
def _average_price_item(sample_datetime, sample_price_list):
    """Return ``[sample time string, average price]`` for one sample point.

    The average is rounded to ``CURRENCY_PAIR_PRECISION + 2`` decimal places.
    ``sample_price_list`` must be non-empty.
    """
    # Plain left-to-right accumulation is kept on purpose: built-in sum()
    # uses compensated summation on newer CPython versions and could change
    # the last digits of previously generated output.
    price_sum = 0
    for price_item in sample_price_list:
        price_sum += price_item
    average_price = round(price_sum / len(sample_price_list),
                          CURRENCY_PAIR_PRECISION + 2)
    sample_datetime_str = CommonUtil.get_string_from_datetime(sample_datetime)
    return [sample_datetime_str, average_price]


def process_original_price():
    """Resample the original prices onto fixed sample points and save them.

    Reads ``ORIGINAL_PRICE_PATH`` into the module-level ``originalPriceList``,
    averages the prices that fall into each sample window, appends the
    resulting ``[sample time string, average price]`` rows to the module-level
    ``processedPriceList`` and writes that list to
    ``PROCESSED_PRICE_PATH + '_' + str(PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX``.

    Sample points start at ``PRICE_START_TIME`` and advance with
    ``CommonUtil.get_next_sample_time``. A price belongs to a sample point
    when its time lies in the half-open window of half a sampling step on
    either side of the point. Processing stops once the current sample point
    is later than ``PRICE_END_TIME``.

    Rows are consumed in file order, so the input is presumably sorted by
    time -- verify against the data source.
    """
    logger.info("In Process Original Price...")
    global originalPriceList
    originalPriceList = CommonUtil.read_csv(ORIGINAL_PRICE_PATH)
    # Half the sampling step, in seconds: the window radius around a point.
    half_window_seconds = PRICE_SAMPLE_MINUTE * 60 / 2
    sample_datetime = None
    end_datetime = None
    sample_price_list = []
    for original_price in originalPriceList:
        logger.debug('price time: ' + original_price[0])
        price_datetime = CommonUtil.get_datetime_from_string(original_price[0])
        price_value = float(original_price[1])
        if sample_datetime is None:
            # Parse both bounds once, on the first row, instead of re-parsing
            # the end time for every row.
            sample_datetime = CommonUtil.get_datetime_from_string(
                PRICE_START_TIME)
            end_datetime = CommonUtil.get_datetime_from_string(PRICE_END_TIME)
        time_interval = CommonUtil.get_interval_seconds(
            price_datetime, sample_datetime)
        # Price is earlier than the current window: take the next price.
        if time_interval < -half_window_seconds:
            continue
        # Price is later than the current window: emit the finished sample
        # point (if it has prices) and advance until the price fits.
        while time_interval >= half_window_seconds:
            if sample_price_list:
                processedPriceList.append(
                    _average_price_item(sample_datetime, sample_price_list))
                sample_price_list = []
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, PRICE_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(
                price_datetime, sample_datetime)
        logger.debug('sample datetime:' +
                     CommonUtil.get_string_from_datetime(sample_datetime))
        # The sample point moved past the configured end time: stop.
        if sample_datetime > end_datetime:
            break
        # The price falls inside the current window [start, end).
        sample_price_list.append(price_value)
    # Emit the last sample point, if it collected any prices.
    if sample_price_list:
        processedPriceList.append(
            _average_price_item(sample_datetime, sample_price_list))
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, processedPriceList)
    logger.info("Process Original Price Done!")
示例#4
0
def generate_feature_vector():
    """Combine news features and sampled prices into training vectors.

    Steps performed:

    1. Call ``prepare_feature()`` and append a header row (the keys of
       ``featureDict`` plus ``'TARGET'``) to ``featureVectorList``.
    2. Load the news features and the processed prices into the module-level
       ``newsFeatureList`` and ``processedPriceList``.
    3. Rewrite every news timestamp with ``CommonUtil.reset_news_time``.
    4. For every price inside ``[PRICE_START_TIME, PRICE_END_TIME)``, walk
       minute by minute from the previous price to the current one and add,
       for each minute, the news features weighted by ``decay_influence``.
       When at least one news item contributed, the accumulated vector with
       the scaled price change appended is added to ``featureVectorList``.
    5. Write ``featureVectorList`` to
       ``FEATURE_VECTOR_PATH + '_' + str(PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX``.

    The news scan relies on a moving start index, so the news rows are
    presumably sorted by time -- verify against the data source.
    """
    logger.info("In Generate Feature Vector...")
    prepare_feature()
    # Header row: one column per feature plus the prediction target.
    title_list = list(featureDict.keys())
    title_list.append('TARGET')
    featureVectorList.append(title_list)
    feature_size = len(featureDict.keys())
    global newsFeatureList
    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    global processedPriceList
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    processedPriceList = CommonUtil.read_csv(file_path)
    news_count = len(newsFeatureList)
    # Original author's note: news covers 20160630-20171229, prices cover
    # 20160701-20171229.
    last_news_begin = 0
    # Pre-seeded so the bookkeeping below still works when the scan range
    # is empty.
    news_feature_begin_index = last_news_begin
    # Synthetic "previous price" used for the very first processed price.
    pre_price_item = [PRICE_START_TIME, 0]
    price_start_time = CommonUtil.get_datetime_from_string(PRICE_START_TIME)
    price_end_time = CommonUtil.get_datetime_from_string(PRICE_END_TIME)
    # Per the original comment: news published while the market is closed is
    # re-timed to NEWS_INFLUENCE_MOST minutes before the market opens.
    # Rows are updated in place.
    for news_feature in newsFeatureList:
        news_feature[0] = CommonUtil.reset_news_time(
            news_feature[0], NEWS_INFLUENCE_MOST, MARKET_OPEN_TIME,
            MARKET_CLOSE_TIME)
    for current_price_item in processedPriceList:
        current_price_time = CommonUtil.get_datetime_from_string(
            current_price_item[0])
        if price_start_time <= current_price_time < price_end_time:
            # Scaled price change relative to the previous price.
            price_delta = round(
                (float(current_price_item[1]) - float(pre_price_item[1])) *
                FEATURE_VECTOR_SCALE, CURRENCY_PAIR_PRECISION)
            pre_price_time = CommonUtil.get_datetime_from_string(
                pre_price_item[0])
            logger.debug(current_price_time)
            # Whole minutes between the previous and the current price.
            last_interval_minutes = int(
                CommonUtil.get_interval_seconds(current_price_time,
                                                pre_price_time) / 60)
            influence_feature_vector = [0.0] * feature_size
            is_influenced_price = False
            # Evaluate the news influence at every minute after
            # pre_price_time, up to and including current_price_time.
            for minute_i in range(0, last_interval_minutes):
                time_i = CommonUtil.get_minute_changed(pre_price_time,
                                                       minute_i + 1)
                # Find the first news item that still influences time_i.
                for news_feature_begin_index in range(last_news_begin,
                                                      news_count):
                    interval_seconds = CommonUtil.get_interval_seconds(
                        time_i,
                        CommonUtil.get_datetime_from_string(
                            newsFeatureList[news_feature_begin_index][0]))
                    if 0 <= interval_seconds <= NEWS_INFLUENCE_DACAY_THRESHOLD * 60:
                        # Find the exclusive end: the first news item that is
                        # later than time_i.
                        for news_feature_end_index in range(
                                news_feature_begin_index, news_count):
                            if CommonUtil.get_datetime_from_string(
                                    newsFeatureList[news_feature_end_index][0]) \
                                    > time_i:
                                break
                        else:
                            # No later news exists, so every remaining item is
                            # in range. Without this the last news item was
                            # silently dropped (off-by-one).
                            news_feature_end_index = news_count
                        str_begin_end = str(minute_i + 1) + ': news->' + str(
                            news_feature_begin_index) + ' : ' + str(
                                news_feature_end_index - 1)
                        logger.debug(str_begin_end)
                        # Accumulate every in-range news item, weighted by
                        # its decayed influence at time_i.
                        for news_feature_index in range(
                                news_feature_begin_index,
                                news_feature_end_index):
                            current_news_feature = newsFeatureList[
                                news_feature_index]
                            influence_score = decay_influence(
                                CommonUtil.get_datetime_from_string(
                                    current_news_feature[0]), time_i)
                            for value_i in range(0, feature_size):
                                influence_feature_vector[value_i] += \
                                    float(current_news_feature[value_i + 1]) \
                                    * influence_score
                        is_influenced_price = True
                        break
                    elif interval_seconds < 0:
                        # This news item is later than time_i: nothing in
                        # range for this minute.
                        break
                # Resume the next scan where this one stopped.
                last_news_begin = news_feature_begin_index
            if is_influenced_price:
                influence_feature_vector.append(price_delta)
                featureVectorList.append(influence_feature_vector)
        pre_price_item = current_price_item
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, featureVectorList)
    logger.info("Generate Feature Vector Done!")
示例#5
0
def _average_price_row(sample_datetime, sample_price_list, precision):
    """Return ``[sample time string, average price]`` for one sample point.

    The average is rounded to ``precision`` decimal places.
    ``sample_price_list`` must be non-empty.
    """
    # Plain left-to-right accumulation is kept on purpose: built-in sum()
    # uses compensated summation on newer CPython versions and could change
    # the last digits of previously generated output.
    price_sum = 0
    for price_item in sample_price_list:
        price_sum += price_item
    average_price = round(price_sum / len(sample_price_list), precision)
    sample_datetime_str = CommonUtil.get_string_from_datetime(sample_datetime)
    return [sample_datetime_str, average_price]


def process_original_price(originalPriceList, PRICE_START_TIME,
                           PRICE_END_TIME):
    """Resample original prices onto fixed sample points and average them.

    Sample points start at ``PRICE_START_TIME`` and advance with
    ``CommonUtil.get_next_sample_time`` (60-minute step, market hours
    09:30:00-23:30:00). A price belongs to a sample point when its time lies
    in the half-open window ``[point - 30 min, point + 30 min)``.

    Args:
        originalPriceList: Iterable of rows; element 0 is the timestamp
            string and element 1 is the price (anything ``float`` accepts).
            Rows are consumed in order, so they are presumably sorted by
            time -- verify against the caller.
        PRICE_START_TIME: Start time string, parsed with
            ``CommonUtil.get_datetime_from_string_``. Per the original note
            it should match the time range of the price table.
        PRICE_END_TIME: End time string, parsed the same way. Processing
            stops once the current sample point is later than this time.

    Returns:
        A list of ``[sample time string, average price]`` rows, where the
        average is rounded to 6 decimal places.

    Raises:
        ValueError: If a row has an empty price (the message names the row's
            timestamp) or a price that ``float`` cannot parse.
    """
    PRICE_SAMPLE_MINUTE = 60
    CURRENCY_PAIR_PRECISION = 4
    # Market opening time.
    MARKET_OPEN_TIME = '09:30:00'
    # Market closing time.
    MARKET_CLOSE_TIME = '23:30:00'
    # Half the sampling step, in seconds: the window radius around a point.
    half_window_seconds = PRICE_SAMPLE_MINUTE * 60 / 2
    average_precision = CURRENCY_PAIR_PRECISION + 2
    # Resampled prices, e.g. [2018/6/30 15:00:00, 6.6433].
    processedPriceList = []

    sample_datetime = None
    end_datetime = None
    sample_price_list = []
    for original_price in originalPriceList:
        raw_price = original_price[1]
        # An empty price used to print 'null' and then crash inside float()
        # without saying which row was broken; fail with context instead.
        if raw_price == '':
            raise ValueError(
                'empty price for timestamp %r' % (original_price[0],))
        price_value = float(raw_price)
        price_datetime = CommonUtil.get_datetime_from_string_(
            original_price[0])
        if sample_datetime is None:
            # Parse both bounds once, on the first row, instead of re-parsing
            # the end time for every row.
            sample_datetime = CommonUtil.get_datetime_from_string_(
                PRICE_START_TIME)
            end_datetime = CommonUtil.get_datetime_from_string_(
                PRICE_END_TIME)
        time_interval = CommonUtil.get_interval_seconds(
            price_datetime, sample_datetime)
        # Price is earlier than the current window: take the next price.
        if time_interval < -half_window_seconds:
            continue
        # Price is later than the current window: emit the finished sample
        # point (if it has prices) and advance until the price fits.
        while time_interval >= half_window_seconds:
            if sample_price_list:
                processedPriceList.append(
                    _average_price_row(sample_datetime, sample_price_list,
                                       average_precision))
                sample_price_list = []
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, PRICE_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(
                price_datetime, sample_datetime)
        # The sample point moved past the requested end time: stop.
        if sample_datetime > end_datetime:
            break
        # The price falls inside the current window [start, end).
        sample_price_list.append(price_value)
    # Emit the last sample point, if it collected any prices.
    if sample_price_list:
        processedPriceList.append(
            _average_price_row(sample_datetime, sample_price_list,
                               average_precision))
    return processedPriceList