def up_down_num_statistics(date):
    '''
    count the 5-min ticks on which the index delta and the sentiment index move in the same direction
    '''
    shindex_seq = []
    tmp = iohelper.read_pickle2objects(date, 'shindex_seq')
    if len(tmp) == 50:
        tmp.pop(25)   # trim two extra ticks (index 25, then the first tick) down to 48 points
        tmp.pop(0)
    tmp = [float(index) for index in tmp]
    for i in range(len(tmp)):
        if i == 0:
            # baseline for the first delta; per-day constants: 17th: 2870.43; 18th: 2904.8319; 23rd: 2999.3628; 28th: 2979.4343
            shindex_seq.append(tmp[i] - 2979.4343)
        else:
            shindex_seq.append(tmp[i] - tmp[i - 1])

    saindex_seq = iohelper.read_pickle2objects(date, 'saindex_seq')
    if len(saindex_seq) == 50:
        saindex_seq.pop(25)
        saindex_seq.pop(0)
    saindex_seq = [float(index) for index in saindex_seq]

    count = 0
    for i in range(len(shindex_seq)):
        if (shindex_seq[i] > 0 and saindex_seq[i] > 0) or (shindex_seq[i] < 0 and saindex_seq[i] < 0) or (shindex_seq[i] == 0 and saindex_seq[i] == 0):
            count += 1
    print('up down common numbers : %d' % count)
    return count
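# A hedged sketch (the helper name is ours, not part of the original module):
# the sign-agreement count above as a pure function over two delta sequences,
# convenient for testing without pickled inputs.
def count_sign_agreement(deltas_a, deltas_b):
    '''count ticks where both sequences move in the same direction (or both are flat)'''
    return sum(1 for a, b in zip(deltas_a, deltas_b)
               if (a > 0) == (b > 0) and (a < 0) == (b < 0))

# e.g. count_sign_agreement([1.2, -0.5, 0.0], [0.3, -0.1, 0.0]) returns 3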
def read_data():
    '''
    read closing data from files
    '''
    global date_of_march, date_of_april, date_of_may, date_of_june

    dir_list = []
    dir_list.extend(date_of_march)
    dir_list.extend(date_of_april)
    dir_list.extend(date_of_may)

    # training set
    original_train_data = [[], []]
    for subdir in dir_list:
        # stock index data
        train_seq = iohelper.read_pickle2objects(subdir, 'shindex_seq')
        original_train_data[0].extend(map(float, train_seq))
        # sentiment data
        train_seq = iohelper.read_pickle2objects(subdir, 'saindex_seq')
        original_train_data[1].extend(map(float, train_seq))

    # picture(original_train_data[0])   # optional debug plot of the training index series

    # test set
    original_test_data = [[], []]
    for subdir in date_of_june:
        # stock index data
        test_seq = iohelper.read_pickle2objects(subdir, 'shindex_seq')
        original_test_data[0].extend(map(float, test_seq))
        # sentiment data
        test_seq = iohelper.read_pickle2objects(subdir, 'saindex_seq')
        original_test_data[1].extend(map(float, test_seq))

    return (original_train_data, original_test_data)
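# Hedged usage sketch (assumes the date lists and the iohelper pickles are in
# place): each returned pair holds two parallel float series, the index values
# in slot 0 and the sentiment values in slot 1, one entry per 5-minute tick
# across the listed days.
# train_data, test_data = read_data()
# assert len(train_data[0]) == len(train_data[1])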
def correlation_analysis(date):
    '''
    correlate shindex_seq with saindex_seq for one day
    return: multi-value (num, coef, tick_count)
    '''

    # Analysis 1 : up down statistics
    num = up_down_num_statistics(date)

    # Analysis 2 : seq_process - index sum sequence
    shindex_seq = iohelper.read_pickle2objects(date, 'shindex_seq')  # shanghai composite index sequence
    if len(shindex_seq) == 50:
        shindex_seq.pop(25)
        shindex_seq.pop(0)
    shindex_seq = [float(index) for index in shindex_seq]
    saindex_seq = []    # sentiment index sequence (running sum)
    tmp = iohelper.read_pickle2objects(date, 'saindex_seq')
    if len(tmp) == 50:
        tmp.pop(25)
        tmp.pop(0)
    tmp = [float(index) for index in tmp]
    for i in range(len(tmp)):
        saindex_seq.append(sum(tmp[0:i]))   # cumulative sentiment before tick i (0 for the first tick)

    # time series day
    tick_seq = get_tick_time_series()
    if len(tick_seq) == 50:
        tick_seq.pop(25)
        tick_seq.pop(0)

    # data normalization_min_max
    # shindex_seq = normalization_min_max(shindex_seq)
    # saindex_seq = normalization_min_max(saindex_seq)
    # data normalization_z-score
    shindex_seq = normalization_zero_mean(shindex_seq)
    saindex_seq = normalization_zero_mean(saindex_seq)

    # tick_seq = tick_seq[1:47]        # two 5-min forward
    # shindex_seq = shindex_seq[1:47]  # two 5-min forward
    # saindex_seq = saindex_seq[0:46]  # two 5-min forward to show the sentiment

    # tick_seq = tick_seq[2:47]        # three 5-min forward
    # shindex_seq = shindex_seq[2:47]  # three 5-min forward
    # saindex_seq = saindex_seq[0:45]  # three 5-min forward to show the sentiment

    print('sh day index : %s %d' % (shindex_seq, len(shindex_seq)))
    print('sa day index : %s %d' % (saindex_seq, len(saindex_seq)))
    print('ti day index : %s %d' % (tick_seq, len(tick_seq)))

    coef = pearson_corr(shindex_seq, saindex_seq)
    print('pearson coefficient : %f' % coef)
    plot_index_and_sentiment(tick_seq, shindex_seq, saindex_seq, date)
    return num, coef, len(tick_seq)
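# normalization_zero_mean and pearson_corr are defined elsewhere in the
# project; minimal sketches of what they are assumed to compute (a standard
# z-score and Pearson's r), with "_sketch" names to avoid clashing:
import math

def normalization_zero_mean_sketch(seq):
    # z-score: subtract the mean, divide by the (population) standard deviation
    mean = sum(seq) / len(seq)
    std = math.sqrt(sum((x - mean) ** 2 for x in seq) / len(seq))
    return [(x - mean) / std for x in seq] if std else [0.0] * len(seq)

def pearson_corr_sketch(xs, ys):
    # Pearson's r = cov(x, y) / (std(x) * std(y))
    n = len(xs)
    mx, my = sum(xs) / n, sum(ys) / n
    cov = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    sx = math.sqrt(sum((x - mx) ** 2 for x in xs))
    sy = math.sqrt(sum((y - my) ** 2 for y in ys))
    return cov / (sx * sy)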
def main():
    FILE = os.curdir
    logging.basicConfig(filename=os.path.join(FILE, 'log.txt'), level=logging.ERROR)
    global g_classifier_name
    print('------------------%s-------------------' % (g_classifier_name))
    # load the positive and negative sentiment lexicons
    pos_lexicon_dict = {}
    neg_lexicon_dict = {}
    pos_lexicon_dict.update(iohelper.read_lexicon2dict('hownet-positive.txt'))
    pos_lexicon_dict.update(iohelper.read_lexicon2dict('ntusd-positive.txt'))
    pos_lexicon_dict.update(iohelper.read_lexicon2dict('positive.txt', True))

    neg_lexicon_dict.update(iohelper.read_lexicon2dict('hownet-negative.txt'))
    neg_lexicon_dict.update(iohelper.read_lexicon2dict('ntusd-negative.txt'))
    neg_lexicon_dict.update(iohelper.read_lexicon2dict('negative.txt', True))

    print('pos_lexicon_dict length : %d' % len(pos_lexicon_dict))
    print('neg_lexicon_dict length : %d' % len(neg_lexicon_dict))

    global g_best_words
    g_best_words = iohelper.read_pickle2objects('./Reviews/best_words.pkl')

    print('---------------Sentiment Index Computation------------------')
    sentiment_index_compute(pos_lexicon_dict, neg_lexicon_dict)
    print('The End!')
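# Equivalent loop-driven form of the lexicon loading above (a sketch; the
# helper name is ours, but every call appears verbatim in main()):
def load_lexicons_sketch():
    pos, neg = {}, {}
    for target, names in ((pos, ['hownet-positive.txt', 'ntusd-positive.txt']),
                          (neg, ['hownet-negative.txt', 'ntusd-negative.txt'])):
        for name in names:
            target.update(iohelper.read_lexicon2dict(name))
    pos.update(iohelper.read_lexicon2dict('positive.txt', True))
    neg.update(iohelper.read_lexicon2dict('negative.txt', True))
    return pos, neg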
def sentiment_machine_learning(tick_blog_segments, computeType='log', isPrint=False):
    '''
    use a trained supervised classifier to compute every sentence's sentiment index
    '''
    # build the feature dict for each comment in this 5-minute window
    global g_best_words
    tick_features = []
    for comment in tick_blog_segments:
        tmp = dict([(word, True) for word in comment if word in g_best_words])
        tick_features.append(tmp)
    if len(tick_features) == 0:
        return 0.01   # nothing to classify in this window: fall back to a small constant index
    # load the trained classifier
    classifier = iohelper.read_pickle2objects('./Reviews/' + g_classifier_name + '.pkl')
    ret = classifier.prob_classify_many(tick_features)

    pos_list = []
    neg_list = []
    for prob_dict in ret:
        samples = prob_dict.samples()  # probabilities for the 'pos' and 'neg' labels; they always sum to 1
        for sp in samples:
            if sp == 'pos':
                pos_list.append(prob_dict.prob(sp))
            else:
                neg_list.append(prob_dict.prob(sp))
    tick_value_tmp = compute_by_type(pos_list, neg_list, computeType)
    if isPrint:
        print('FIRST-5-MIN Index : %f' % tick_value_tmp)
    return tick_value_tmp
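# Hedged usage sketch: the argument is one 5-minute window of segmented
# comments, each comment a list of tokens (the tokens below are made up):
# segments = [['利好', '上涨'], ['恐慌', '下跌']]
# index = sentiment_machine_learning(segments, computeType='log', isPrint=True)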
def main():
    pos_tk_lst = iohelper.read_pickle2objects('./Reviews/pos_reviews.pkl')
    neg_tk_lst = iohelper.read_pickle2objects('./Reviews/neg_reviews.pkl')
    # shuffle the review collections into random order
    shuffle(pos_tk_lst)
    shuffle(neg_tk_lst)
    posWords = list(itertools.chain(*pos_tk_lst))  # flatten the nested token lists into one word list
    negWords = list(itertools.chain(*neg_tk_lst))  # likewise for the negative side

    # choose one: all words alone, or all words plus bigrams, with chi-squared feature selection
    # print('1.Word Feature Selection-Chi-sq!')
    # word_scores = create_word_scores(posWords, negWords)
    print('2.Word_Plus_Bigram Feature Selection-Chi-sq!')
    pos_tk_lst = words_plus_bigram(pos_tk_lst)
    neg_tk_lst = words_plus_bigram(neg_tk_lst)
    word_scores = create_word_bigram_scores(posWords, negWords)

    global best_words
    best_words = find_best_words(word_scores, 1500)
    iohelper.save_objects2pickle(best_words, './Reviews/best_words.pkl')

    posFeatures = pos_features(pos_tk_lst, best_word_features)  # [[{'':True, '':True,...}, 'pos'], [{'':True, '':True,...}, 'neg']]
    negFeatures = neg_features(neg_tk_lst, best_word_features)
    print('POS_FEATURES_LENGTH %d\tNEG_FEATURES_LENGTH %d' %
          (len(posFeatures), len(negFeatures)))
    assert len(posFeatures) == len(negFeatures)
    print('-------------------------------------------------')

    Classifier_Type = ['Lexicons', 'LR', 'BernoulliNB', 'MultinomialNB', 'LinearSVC', 'NuSVC']  # 'SVC' IS CANCELLED
    (pos_lexicon_dict, neg_lexicon_dict) = rp.load_sentiment_lexicon()

    # 10-fold cross-validation
    cut_size = int(len(posFeatures) * 0.9)
    offset_size = len(posFeatures) - cut_size
    avg_scores = {}
    avg_precision = {}
    avg_recall = {}
    avg_time = {}
    for tp in Classifier_Type:
        avg_scores[tp] = 0.0
        avg_precision[tp] = 0.0
        avg_recall[tp] = 0.0
        avg_time[tp] = 0.0
    posTmp = []
    negTmp = []
    # compare classifiers: lexicon-based versus supervised learning
    for tp in Classifier_Type:
        precision = 0.0
        recall = 0.0
        score = 0.0
        time = 0.0
        if tp == 'Lexicons':
            posTmp = posFeatures
            negTmp = negFeatures
            posFeatures = pos_tk_lst
            negFeatures = neg_tk_lst

        print('Classifier_Type : %s' % (tp))
        for k in range(1, 11):
            test_list = posFeatures[(k - 1) * offset_size:k * offset_size] + negFeatures[(k - 1) * offset_size:k * offset_size]
            if k == 1:
                train_list = posFeatures[k * offset_size:] + negFeatures[k * offset_size:]
            elif k == 10:
                train_list = posFeatures[:(k - 1) * offset_size] + negFeatures[:(k - 1) * offset_size]
            else:
                train_list = posFeatures[:(k - 1) * offset_size] + posFeatures[k * offset_size:] + negFeatures[:(k - 1) * offset_size] + negFeatures[k * offset_size:]

            if tp == 'Lexicons':
                test = test_list
                test_tag = ['pos' for i in range(offset_size)]
                test_tag.extend(['neg' for i in range(offset_size)])
                time, precision, recall, score = sentiment_lexicon_score(
                    pos_lexicon_dict, neg_lexicon_dict, test, test_tag)
            else:
                test, test_tag = zip(*test_list)  # split each (features, label) pair into a tuple of feature dicts and a tuple of 'pos'/'neg' tags
                if tp == 'LR':
                    time, precision, recall, score = classifier_score(tp, LogisticRegression(), train_list, test, test_tag)
                elif tp == 'BernoulliNB':
                    time, precision, recall, score = classifier_score(tp, BernoulliNB(), train_list, test, test_tag)
                elif tp == 'MultinomialNB':
                    time, precision, recall, score = classifier_score(tp, MultinomialNB(), train_list, test, test_tag)
                elif tp == 'LinearSVC':
                    time, precision, recall, score = classifier_score(tp, LinearSVC(), train_list, test, test_tag)
                elif tp == 'NuSVC':
                    time, precision, recall, score = classifier_score(tp, NuSVC(probability=True), train_list, test, test_tag)
                elif tp == 'SVC':
                    time, precision, recall, score = classifier_score(tp, SVC(gamma=0.001, C=100., kernel='linear', probability=True), train_list, test, test_tag)
            avg_scores[tp] += score
            avg_precision[tp] += precision
            avg_recall[tp] += recall
            avg_time[tp] += time
            print('The precision, recall, accuracy score, and training time are respectively : %f %f %f %f'
                  % (precision, recall, score, time))
        if tp == 'Lexicons':
            posFeatures = posTmp
            negFeatures = negTmp
            posTmp = []
            negTmp = []
        print('-------------------------------------------------')
    for tp in Classifier_Type:
        avg_scores[tp] = avg_scores[tp] / 10
        avg_precision[tp] = avg_precision[tp] / 10
        avg_recall[tp] = avg_recall[tp] / 10
        avg_time[tp] = avg_time[tp] / 10
        print ("The %s\'s average precision recall accuracy score and training time is repectively : %.2f %.2f %.2f %.2f" % \
            (tp, avg_precision[tp], avg_recall[tp], avg_scores[tp], avg_time[tp]))
    print("The End!")