# -*- coding: utf-8 -*-
# Imports assumed by the functions below (the original listing omits them).
# iohelper, rp, st and the helpers word_tokenization, tf_idf, is_word_invalid,
# words_plus_bigram, create_word_bigram_scores, find_best_words,
# best_word_features, pos_features, neg_features, sentiment_lexicon_score,
# sentiment_machine_learning and sentiment_lexicons_compute are project-local
# modules/functions that are not shown here.
import datetime
import datetime as dt
import itertools
import math
from random import shuffle

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC, NuSVC, SVC


def pos_or_neg_reviews2pkl():
    '''
    Convert the negative and positive review files to token lists
    [[token, ...], [token, ...], ...] and save each list to a pickle.
    '''
    neg_list = iohelper.read_file2list('neg')
    pos_list = iohelper.read_file2list('pos')
    neg_tk_lst = word_tokenization(neg_list)
    pos_tk_lst = word_tokenization(pos_list)    # segmented reviews: [[token, ...], [token, ...], ...]
    iohelper.save_objects2pickle(neg_tk_lst, './Reviews/neg_reviews.pkl')
    iohelper.save_objects2pickle(pos_tk_lst, './Reviews/pos_reviews.pkl')
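
# Illustrative sketch (assumption): word_tokenization is project-local and not
# shown in this listing. For Chinese review text, a minimal version built on
# jieba segmentation could look like this; the real helper may also filter
# stop words and punctuation.
import jieba

def word_tokenization_sketch(review_list):
    '''Segment each review string into a token list: ["...", ...] -> [[token, ...], ...]'''
    return [list(jieba.cut(review)) for review in review_list]
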
def classifier_score(tp, classifier, train_list, test, test_tag):
    '''
    Train the given classifier on train_list and evaluate it on the test set.
    Returns: training time in seconds, pos_precision, pos_recall, accuracy_score.
    '''
    starttime = datetime.datetime.now()
    classifier = SklearnClassifier(classifier)
    classifier.train(train_list)
    iohelper.save_objects2pickle(classifier, './Reviews/' + tp + '.pkl')
    pred = classifier.classify_many(test)  # returns a list of predicted labels
    y_true = [1 if tag == 'pos' else 0 for tag in test_tag]
    y_pred = [1 if tag == 'pos' else 0 for tag in pred]
    pos_precision = precision_score(y_true, y_pred)
    pos_recall = recall_score(y_true, y_pred)
    endtime = datetime.datetime.now()
    # .microseconds only holds the sub-second component and silently drops whole
    # seconds; total_seconds() measures the full elapsed time.
    interval = (endtime - starttime).total_seconds()
    return interval, pos_precision, pos_recall, accuracy_score(test_tag, pred)
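
def _demo_classifier_score():
    '''Illustrative usage only; the data here is made up. classifier_score
    expects NLTK-style (feature_dict, label) pairs for training and a parallel
    feature list / label list for testing. Note that it also pickles the
    trained model under ./Reviews/.'''
    train = [({'good': True}, 'pos'), ({'bad': True}, 'neg')] * 20
    test = [{'good': True}, {'bad': True}]
    test_tag = ['pos', 'neg']
    secs, p, r, acc = classifier_score('demo', LogisticRegression(), train, test, test_tag)
    print('time=%.4fs precision=%.2f recall=%.2f accuracy=%.2f' % (secs, p, r, acc))
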
def word_preprocessing(blog_corpus, subdir):
    '''
    blog_corpus : [["", ""], ["", ""], ...]; new_blog_corpus is the filtered version.
    Preprocess the day's microblogs read from the saved files (e.g. 930.txt),
    merge them into one corpus, compute every word's tf-idf, and select feature
    words by tf-idf. Finally save the top tf-idf words to txt and pickle
    (e.g. 20160311/wordDict.txt or 20160311/wordDict.pkl).
    '''
    word_dict = {}
    new_blog_corpus = []
    for doc in blog_corpus:
        tmp = [word for word in doc if not is_word_invalid(word)]
        for word in tmp:
            word_dict[word] = 0
        new_blog_corpus.append(tmp)
    print('word_preprocessing - microblog count: %d raw, %d filtered' %
          (len(blog_corpus), len(new_blog_corpus)))
    print('word_preprocessing - distinct valid words: %d' % len(word_dict))

    word_tfidf_dict = {}
    for doc in new_blog_corpus:
        score_dict = {}
        for term in doc:
            tfidf = tf_idf(term, doc, new_blog_corpus)
            score_dict[term] = tfidf
        score_list = sorted(score_dict.items(), key=lambda d: d[1], reverse=True)  # items(), not the Python 2 iteritems()
        if len(score_list) >= 3:
            for cur in range(3):  # keep each document's top-3 tf-idf words
                word_tfidf_dict[score_list[cur][0]] = score_list[cur][1]
    # word_tfidf_list = sorted(word_tfidf_dict.items(), key=lambda d: d[1])
    word_tfidf_list = [[word, tfidf] for word, tfidf in word_tfidf_dict.items()]
    print('word_preprocessing - selected feature words: %d' % len(word_tfidf_list))
    iohelper.save_objects2pickle(word_tfidf_list, subdir, 'wordTFDict')
    print('word_preprocessing-save word_list_tfidf success!')
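
# Illustrative sketch (assumption): the tf_idf helper called above is
# project-local. A standard definition matching its (term, doc, corpus) call
# signature, where doc is a token list and corpus is a list of token lists:
def tf_idf_sketch(term, doc, corpus):
    tf = doc.count(term) / float(len(doc))     # term frequency within this document
    df = sum(1 for d in corpus if term in d)   # number of documents containing the term
    idf = math.log(len(corpus) / (1.0 + df))   # +1 guards against division by zero
    return tf * idf
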
def main():
    pos_tk_lst = iohelper.read_pickle2objects('./Reviews/pos_reviews.pkl')
    neg_tk_lst = iohelper.read_pickle2objects('./Reviews/neg_reviews.pkl')
    # shuffle the review sets into random order
    shuffle(pos_tk_lst)
    shuffle(neg_tk_lst)
    posWords = list(itertools.chain(*pos_tk_lst))  # flatten the nested token lists into one flat word list
    negWords = list(itertools.chain(*neg_tk_lst))

    # Pick one of the two options below: all words only, or all words plus
    # bigrams; feature selection uses the chi-squared test.
    # print('1.Word Feature Selection-Chi-sq!')
    # word_scores = create_word_scores(posWords, negWords)
    print('2.Word_Plus_Bigram Feature Selection-Chi-sq!')
    pos_tk_lst = words_plus_bigram(pos_tk_lst)
    neg_tk_lst = words_plus_bigram(neg_tk_lst)
    word_scores = create_word_bigram_scores(posWords, negWords)

    global best_words
    best_words = find_best_words(word_scores, 1500)
    iohelper.save_objects2pickle(best_words, './Reviews/best_words.pkl')

    posFeatures = pos_features(pos_tk_lst, best_word_features)  # [[{'': True, ...}, 'pos'], [{'': True, ...}, 'neg']]
    negFeatures = neg_features(neg_tk_lst, best_word_features)
    print('POS_FEATURES_LENGTH %d\tNEG_FEATURES_LENGTH %d' %
          (len(posFeatures), len(negFeatures)))
    assert len(posFeatures) == len(negFeatures)
    print('-------------------------------------------------')

    Classifier_Type = ['Lexicons', 'LR', 'BernoulliNB', 'MultinomialNB', 'LinearSVC', 'NuSVC']  # 'SVC' is excluded
    (pos_lexicon_dict, neg_lexicon_dict) = rp.load_sentiment_lexicon()

    # 10-fold cross-validation
    cut_size = int(len(posFeatures) * 0.9)
    offset_size = len(posFeatures) - cut_size
    avg_scores = {}
    avg_precision = {}
    avg_recall = {}
    avg_time = {}
    for tp in Classifier_Type:
        avg_scores[tp] = 0.0
        avg_precision[tp] = 0.0
        avg_recall[tp] = 0.0
        avg_time[tp] = 0.0
    posTmp = []
    negTmp = []
    # Compare the classifiers (lexicon-based vs. supervised learning)
    for tp in Classifier_Type:
        precision = 0.0
        recall = 0.0
        score = 0.0
        time = 0.0
        if tp == 'Lexicons':
            posTmp = posFeatures
            negTmp = negFeatures
            posFeatures = pos_tk_lst
            negFeatures = neg_tk_lst

        print('Classifier_Type : %s' % (tp))
        for k in range(1, 11):
            test_list = posFeatures[(k - 1) * offset_size:k * offset_size] + negFeatures[(k - 1) * offset_size:k * offset_size]
            if k == 1:
                train_list = posFeatures[k * offset_size:] + negFeatures[k * offset_size:]
            elif k == 10:
                train_list = posFeatures[:(k - 1) * offset_size] + negFeatures[:(k - 1) * offset_size]
            else:
                train_list = (posFeatures[:(k - 1) * offset_size] + posFeatures[k * offset_size:] +
                              negFeatures[:(k - 1) * offset_size] + negFeatures[k * offset_size:])

            if tp == 'Lexicons':
                test = test_list
                test_tag = ['pos' for i in range(offset_size)]
                test_tag.extend(['neg' for i in range(offset_size)])
                time, precision, recall, score = sentiment_lexicon_score(
                    pos_lexicon_dict, neg_lexicon_dict, test, test_tag)
            else:
                test, test_tag = zip(*test_list)  # split (feature_dict, label) pairs into ({...}, {...}, ...) and ('pos', 'neg', ...)
                if tp == 'LR':
                    time, precision, recall, score = classifier_score(tp, LogisticRegression(), train_list, test, test_tag)
                elif tp == 'BernoulliNB':
                    time, precision, recall, score = classifier_score(tp, BernoulliNB(), train_list, test, test_tag)
                elif tp == 'MultinomialNB':
                    time, precision, recall, score = classifier_score(tp, MultinomialNB(), train_list, test, test_tag)
                elif tp == 'LinearSVC':
                    time, precision, recall, score = classifier_score(tp, LinearSVC(), train_list, test, test_tag)
                elif tp == 'NuSVC':
                    time, precision, recall, score = classifier_score(tp, NuSVC(probability=True), train_list, test, test_tag)
                elif tp == 'SVC':  # excluded from Classifier_Type above; classifier_score returns four values
                    time, precision, recall, score = classifier_score(tp, SVC(gamma=0.001, C=100., kernel='linear', probability=True), train_list, test, test_tag)
            avg_scores[tp] += score
            avg_precision[tp] += precision
            avg_recall[tp] += recall
            avg_time[tp] += time
            print('The precision, recall, accuracy score and training time are respectively : %f %f %f %f'
                  % (precision, recall, score, time))
        if tp == 'Lexicons':
            posFeatures = posTmp
            negFeatures = negTmp
            posTmp = []
            negTmp = []
        print('-------------------------------------------------')
    for tp in Classifier_Type:
        avg_scores[tp] = avg_scores[tp] / 10
        avg_precision[tp] = avg_precision[tp] / 10
        avg_recall[tp] = avg_recall[tp] / 10
        avg_time[tp] = avg_time[tp] / 10
        print ("The %s\'s average precision recall accuracy score and training time is repectively : %.2f %.2f %.2f %.2f" % \
            (tp, avg_precision[tp], avg_recall[tp], avg_scores[tp], avg_time[tp]))
    print("The End!")
def sentiment_index_compute(pos_lexicon_dict, neg_lexicon_dict):
    '''
    1. select the words to construct dictionary
    2. compute sentiment index according to the stock-oriented dictionary
    '''

    opentime1 = st.opentime1
    midclose = st.midclose
    opentime2 = st.opentime2
    closetime = st.closetime
    tick_delta = dt.timedelta(minutes=5)

    # status = input("Construct lexicons or compute sentiment index? Please input yes (lexicon construction) or no (compute sentiment index)!")
    status = 'no'
    isPrint = False  # flag: print per-word scores while computing sentiment
    ml_or_lexicons = input("Compute Sentiment Index: use machine learning (yes) or lexicons (no)? Please input yes or no!")
    computeType = input("Use only pos(1) or neg(2) or total(3) or logarithm(4)? Please input 1, 2, 3 or 4!")
    if computeType == '1':
        computeType = 'pos'
    elif computeType == '2':
        computeType = 'neg'
    elif computeType == '3':
        computeType = 'total'
    elif computeType == '4':
        computeType = 'log'

    review_list_day = []
    date_of_march = ['20160329', '20160331']
    date_of_april = ['20160405', '20160406', '20160407', '20160408',
                     '20160411', '20160412', '20160413', '20160414', '20160415',
                     '20160418', '20160419', '20160420', '20160421',
                     '20160425', '20160426', '20160427', '20160429']
    date_of_may = ['20160503', '20160504', '20160505', '20160506',
                   '20160509', '20160510', '20160511', '20160512', '20160513',
                   '20160516', '20160517', '20160518', '20160519', '20160520',
                   '20160523', '20160524', '20160525', '20160526', '20160527',
                   '20160530', '20160531']
    # june = 10 (compute 10 days of sentiment index for the correlation analysis)
    date_of_june = ['20160601', '20160602', '20160606', '20160613', '20160614',
                    '20160615', '20160620', '20160622', '20160624', '20160628']
    # review_list_day.extend(date_of_march)
    # review_list_day.extend(date_of_april)
    # review_list_day.extend(date_of_may)
    review_list_day.extend(date_of_june)
    # review_list_day = ['20160420']        # just for test

    for subdir in review_list_day:
        tick_now = opentime1
        blog_corpus = []
        sentiment_index = []
        print('>>>The date to be handled : {0}'.format(subdir))
        while True:
            if (tick_now >= opentime1 and tick_now <= midclose) or (tick_now >= opentime2 and tick_now <= closetime):
                hour = tick_now.hour
                minute = tick_now.minute
                isPrint = (hour == 9 and minute == 35)  # '==' rather than 'is': identity checks on ints are unreliable
                fname = str(hour * 100 + minute)
                blog_five_min_list = iohelper.read_txt2list(fname, subdir)
                tick_blog_list = word_tokenization(blog_five_min_list)   # convert the 5-minute reviews to a token-list list like [[,], [,], ...]
                blog_corpus.extend(tick_blog_list)
                tick_now += tick_delta
                # Compute Sentiment Index
                if status != 'yes':
                    score_tmp = 0
                    if ml_or_lexicons == 'yes':
                        score_tmp = sentiment_machine_learning(tick_blog_list, computeType, isPrint)
                    else:
                        score_tmp = sentiment_lexicons_compute(pos_lexicon_dict, neg_lexicon_dict, tick_blog_list, computeType, isPrint)
                    sentiment_index.append(score_tmp)
            elif tick_now > midclose and tick_now < opentime2:
                tick_now = opentime2
            elif tick_now > closetime:
                break
        # not necessary if you have already processed it into the TF-IDF word dict
        if status == 'yes':
            word_preprocessing(blog_corpus, subdir)
            print('%s : word selected from blog_corpus successfully!' % (subdir))
        else:
            iohelper.save_objects2pickle(sentiment_index, './Sentiment Index/' + subdir + '/saindex_seq.pkl')
            print('%s : save_objects2pickle successfully! %d' % (subdir, len(sentiment_index)))
    print('Ending.....')
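
# Illustrative sketch (assumption): sentiment_lexicons_compute is defined
# elsewhere in the project. A minimal lexicon scorer consistent with the
# computeType options ('pos', 'neg', 'total', 'log') used above might be:
def sentiment_lexicons_sketch(pos_lexicon_dict, neg_lexicon_dict, blog_list, compute_type):
    pos_hits = sum(1 for blog in blog_list for w in blog if w in pos_lexicon_dict)
    neg_hits = sum(1 for blog in blog_list for w in blog if w in neg_lexicon_dict)
    if compute_type == 'pos':
        return pos_hits
    if compute_type == 'neg':
        return neg_hits
    if compute_type == 'total':
        return pos_hits - neg_hits
    return math.log((1.0 + pos_hits) / (1.0 + neg_hits))  # 'log': symmetric around 0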