def loading_target_CRFs(command):
    if command == 'twitter':
        # loading target labels
        path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
        name_ = 'labeling_all.txt'
        list_line_ = filterTxt_CRF(load_file(path_, name_), command='removeLink')

    elif command == 'sgforums':
        # loading target labels
        path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF'
        name_ = 'Label_all_crf.txt'
        list_line_ = load_file(path_, name_)

    elif command == 'facebook':
        # loading target labels
        path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF'
        name_ = 'label.txt'
        list_line_ = filterTxt_CRF(load_file(path_, name_), command='removePunc')

    Y = np.array(load_target_label(list_line_))
    print 'Finish loading target label ' + command
    return Y
from CRF_labeling.CRF_POS import create_part_of_speech, intersection_POS_CRF
from CRF_labeling.filterText_CRF import filterTxt_CRF
from CRF_running import folder_files, featuers_CRF, construct_ftr_CRF, load_target_label, n_cross_valid_crf
import numpy as np


if __name__ == '__main__':
    # TWITTER
    # get the text for part-of-speech and our lablled data
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
    name_POS = 'labeling_text_POS.txt'
    name_CRF = 'labeling_all.txt'

    list_ = load_file(path, name_POS)
    list_all = create_part_of_speech(list_)
    list_POS = filterTxt_CRF(list_all, command='removeLink', command_data='twitter')
    list_CRF = filterTxt_CRF(load_file(path, name_CRF), command='removeLink', command_data='twitter')

    list_new = intersection_POS_CRF(list_POS, list_CRF)
    list_new_POS, list_new_CRF = list_new[0], list_new[1]  # note that part-of-speech will have same length now
    list_line_ = list_new_CRF

    # loading CRF features
    # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_POS'
    # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_POS_withoutREG'
    path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_without_POS_RE'
    files_ = folder_files(path_ftr)
    features = featuers_CRF(files_, path_ftr)
    X = np.array(construct_ftr_CRF(features))  # construct the features for CRF
    print X.shape
    print 'Finish loading features for CRF'
    name_svc = 'ftr_reg_svc.csv'
    name_road = 'ftr_reg_match_road.csv'
    name_busstop = 'ftr_reg_match_busstop.csv'

    list_svc = convert_label_CRF(convert_list_CRF(load_file(path_, name_svc)), 'svc')
    list_road = convert_label_CRF(convert_list_CRF(load_file(path_, name_road)), 'road')
    list_busstop = convert_label_CRF(convert_list_CRF(load_file(path_, name_busstop)), 'busstop')
    CRF_F1_reg(list_line_, list_svc, list_road, list_busstop)


if __name__ == '__main__':
    # NOTE(review): this is the second `if __name__ == '__main__':` guard in
    # the file — both suites run when the module is executed; the file looks
    # like several standalone example scripts concatenated together.
    # Calculate the F1 of regular expression
    # SGFORUMS
    # path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF'
    # name_ = 'Label_all_crf.txt'
    # list_line = convert_list_CRF(load_target_label(load_file(path, name_)))
    # loading_reg_ftr(list_line, command='sgforums')

    # TWITTER
    # path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
    # name_ = 'labeling_all.txt'
    # list_line = convert_list_CRF(load_target_label(filterTxt_CRF(load_file(path_, name_), command='removeLink')))
    # loading_reg_ftr(list_line, command='twitter')

    # FACEBOOK: regular-expression F1 on the Facebook labels
    # (Facebook uses 'removePunc' filtering, unlike Twitter's 'removeLink')
    path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF'
    name_ = 'label.txt'
    list_line = convert_list_CRF(load_target_label(filterTxt_CRF(load_file(path_, name_), command='removePunc')))
    loading_reg_ftr(list_line, command='facebook')
    # From here the script switches to the Twitter word-vector CRF features;
    # the commented paths below are the other embedding sizes that were tried.
    # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_30'
    # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_70'
    # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_100'
    # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_150'
    path_ftr = "D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_200"
    files_ = folder_files(path_ftr)
    features = featuers_CRF(files_, path_ftr)
    # construct_ftr_CRF_wordVector(features)
    X = np.array(construct_ftr_CRF_wordVector(features))  # construct the features for CRF
    print X.shape
    print "Finish loading features for CRF"
    #
    # # loading target labels (note: `path_` is re-bound to the Twitter folder here)
    path_ = "D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF"
    name_ = "labeling_all.txt"
    list_line_ = filterTxt_CRF(load_file(path_, name_), "removeLink", "model")
    Y = np.array(load_target_label(list_line_))
    print Y.shape
    print "Finish loading target label"
    #
    # # for index in range(0, len(X)):
    # #     print len(X[index]), len(Y[index])
    #
    # # running CRF models
    # n_cross_valid_crf(X, Y, K=5, command='metrics_F1')  # use to calculate the F1 for classification
    n_cross_valid_crf(X, Y, K=5, command="confusion_matrix")  # compute the confusion matrix
    # # n_cross_valid_crf(X, Y, K=5, command='write_results')  # write per-fold results to file
    #
    # stop = timeit.default_timer()
    # print 'Finish running CRF model %.3f sec' % (stop - start)
    # path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
    # name_clf = 'results_LR_rmLink_twitter.txt'
    # name_crf = 'results_CRF_rmLink_twitter.txt'
    # name_file = 'labeling_all.txt'
    #
    # list_clf = load_results_clf(load_file(path_, name_clf))
    # list_crf = load_results_CRF(load_file(path_, name_crf))
    # list_word = load_text(filterTxt_CRF(load_file(path_, name_file), command='removeLink'))
    #
    # # get_CRFwrong_CLFright(list_crf[0], list_clf[0], list_crf[1], list_word)
    # get_CRFright_CLFwrong(list_crf[0], list_clf[0], list_crf[1], list_word)


    ############################################################################
    ############################################################################
    # USING FOR FACEBOOK DATASET
    ############################################################################
    # use for removing all punctuation and link
    path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF'
    name_clf = 'results_LR.txt'  # results of classification
    name_crf = 'results_CRF.txt'  # results of CRF
    name_file = 'label.txt'

    # Compare classifier output against CRF output on the Facebook data.
    list_clf = load_results_clf(load_file(path_, name_clf))
    list_crf = load_results_CRF(load_file(path_, name_crf))
    # In Twitter, we used 'removeLink' but we need to used 'removePunc' in Facebook
    list_word = load_text(filterTxt_CRF(load_file(path_, name_file), command='removePunc'))

    # get_CRFwrong_CLFright(list_crf[0], list_clf[0], list_crf[1], list_word)
    get_CRFright_CLFwrong(list_crf[0], list_clf[0], list_crf[1], list_word)
# 示例#6 ("Example #6") — artifact of the code-listing scrape; commented out
# 0 — stray count artifact from the same scrape; commented so the module parses
    # # n_cross_valid_clf_CRF(X, Y, clf, K=5, command='confusion_matrix')  # use to calculate the confusion matrix
    # # n_cross_valid_clf_CRF(X, Y, clf, K=5, command='write_results')  # use to calculate the confusion matrix

    #################################################################################################
    # TWITTER
    # loading CRF features, remember that we only remove all punctuation and links in Twitter
    path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLink'
    files_ = folder_files(path_ftr)
    features = featuers_CRF(files_, path_ftr)
    X = np.array(convert_ftr_x_clf(features))
    print len(X)
    # # #
    # # # # loading target labels
    path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
    name_ = 'labeling_all.txt'
    list_line_ = filterTxt_CRF(load_file(path_, name_), 'removeLink')  # remove all punctuations & links
    # label_distribution(list_line_)  # get the distribution of labeling in Twitter
    Y = np.array(convert_frt_y_clf(list_line_))
    print len(Y)
    # print 'Loading the target labels ------------------------------------'
    # # # #
    # # # # # clf = MultinomialNB()
    # clf = svm.LinearSVC(C=1.0, random_state=0, class_weight='auto')
    # # clf = LogisticRegression(class_weight='auto')
    # NOTE(review): `clf` is never bound — every classifier line above is
    # commented out, so the call below raises NameError at runtime; one of
    # the classifiers must be uncommented (and imported) before running.
    n_cross_valid_clf_CRF(X, Y, clf, K=5, command='metrics_F1')  # use to calculate the F1 for classification
    # # n_cross_valid_clf_CRF(X, Y, clf, K=5, command='confusion_matrix')  # use to calculate the confusion matrix
    # # # n_cross_valid_clf_CRF(X, Y, clf, K=5, command='write_results')  # use to calculate the confusion matrix

    #################################################################################################
    # FACEBOOK
    # loading CRF features, remember that we only remove all punctuation and links in Facebook
    # NOTE(review): the header above says FACEBOOK but every path below points
    # at the Twitter data — presumably a copy-paste leftover; confirm intent.
    ############################################################################
    # NOTE(review): `timeit` is not imported in the visible imports (L23-26)
    # of this file — TODO confirm it is imported elsewhere before running.
    start = timeit.default_timer()  # get the start time

    # loading CRF features
    path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLink'
    files_ = folder_files(path_ftr)
    features = featuers_CRF(files_, path_ftr)
    X = np.array(construct_ftr_CRF(features))  # construct the features for CRF
    print X.shape
    print 'Finish loading features for CRF'

    # loading target labels
    path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
    name_ = 'labeling_all.txt'
    list_line_ = filterTxt_CRF(load_file(path_, name_), 'removeLink', 'model')
    Y = np.array(load_target_label(list_line_))
    print Y.shape
    print 'Finish loading target label'

    # for index in range(0, len(X)):
    #     print len(X[index]), len(Y[index])

    # running CRF models
    # n_cross_valid_crf(X, Y, K=5, command='metrics_F1')  # use to calculate the F1 for classification
    n_cross_valid_crf(X, Y, K=5, command='confusion_matrix')  # compute the confusion matrix
    # n_cross_valid_crf(X, Y, K=5, command='write_results')  # write per-fold results to file

    stop = timeit.default_timer()
    print 'Finish running CRF model %.3f sec' % (stop - start)
def create_ftrList(path_write, command):
    """Build and write every CRF feature file for one cross-domain experiment.

    `command` names a pair such as 'twitter_vs_sgforums': the dataset AFTER
    '_vs_' supplies the labelled lines that features are extracted from,
    while the dataset BEFORE '_vs_' supplies the learned before/after-token
    dictionaries used at the end.  Each feature list is written to its own
    file under `path_write` via write_file.
    """
    # -------------- select the labelled lines of the target dataset
    if command == 'twitter_vs_sgforums' or command == 'facebook_vs_sgforums':
        path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF'
        name_ = 'Label_all_crf.txt'
        list_line_ = load_file(path_, name_)
    elif command == 'sgforums_vs_twitter' or command == 'facebook_vs_twitter':
        path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
        name_ = 'labeling_all.txt'
        list_line_ = filterTxt_CRF(load_file(path_, name_), command='removeLink')
    elif command == 'twitter_vs_facebook' or command == 'sgforums_vs_facebook':
        path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF'
        name_ = 'label.txt'
        list_line_ = filterTxt_CRF(load_file(path_, name_), command='removePunc')
    else:
        # fail fast on an unrecognised command
        print 'Need to give the correct command'
        quit()

    # -------------- loading all tokens features type
    ftr_isDigit = isAllDigit(list_line_)
    write_file(path_write, 'ftr_isDigit', ftr_isDigit)
    ftr_isCharacter = isAllCharacter(list_line_)
    write_file(path_write, 'ftr_isCharacter', ftr_isCharacter)
    ftr_isBusPlate = is_busPlate(list_line_)
    write_file(path_write, 'ftr_isBusPlate', ftr_isBusPlate)
    ftr_isCapitalized = isCapitalize(list_line_)
    write_file(path_write, 'ftr_isCapitalized', ftr_isCapitalized)

    # -------------- loading all tokens match dictionary
    # note that the dictionary is the same in different datasets: SGFORUMS, TWITTER, FACEBOOK
    ftr_match_dict_svc = match_dict(list_line_, command='svc')
    write_file(path_write, 'ftr_match_dict_svc', ftr_match_dict_svc)
    ftr_match_dict_road = match_dict(list_line_, command='road')
    write_file(path_write, 'ftr_match_dict_road', ftr_match_dict_road)
    ftr_match_dict_busstop = match_dict(list_line_, command='busstop')
    write_file(path_write, 'ftr_match_dict_busstop', ftr_match_dict_busstop)

    # bus-stop codes are only extracted for the facebook_* experiments
    if command == 'facebook_vs_sgforums' or command == 'facebook_vs_twitter':
        ftr_match_dict_busstopCode = match_dict(list_line_, command='busstopCode')
        write_file(path_write, 'ftr_match_dict_busstopCode', ftr_match_dict_busstopCode)

    # -------------- loading all tokens match bus service, road and bus stop using regular expression
    # note that the regular expression is the same in different datasets: SGFORUMS, TWITTER, FACEBOOK
    ftr_reg_svc = reg_bussvc(list_line_, n_token=10)
    write_file(path_write, 'ftr_reg_svc', ftr_reg_svc)
    ftr_match_road = match_road_busstop(list_line_, command='road')
    write_file(path_write, 'ftr_reg_match_road', ftr_match_road)
    ftr_match_busstop = match_road_busstop(list_line_, command='busstop')
    write_file(path_write, 'ftr_reg_match_busstop', ftr_match_busstop)

    # -------------- loading all token before and after match dictionary
    # note that the dictionary is the same in different datasets: SGFORUMS, TWITTER, FACEBOOK
    types = ['svc', 'road', 'busstop']
    for value in types:
        list_command = token_bef_matchDict(list_line_, value)
        write_file(path_write, 'ftr_tok_bef_match_' + value, list_command)

    for value in types:
        list_command = token_aft_matchDict(list_line_, value)
        write_file(path_write, 'ftr_tok_aft_match_' + value, list_command)

    # -------------- loading all token type before and after labeling
    types = ['Capitalized', 'Digit', 'Character', 'BusPlate']
    for value in types:
        list_command = token_bef_type(list_line_, value)
        write_file(path_write, 'ftr_tok_bef_is' + value, list_command)

    for value in types:
        list_command = token_aft_type(list_line_, value)
        write_file(path_write, 'ftr_tok_aft_is' + value, list_command)

    # -------------- dictionaries of actual words seen before/after labels,
    # taken from the *source* dataset of the cross-domain pair
    if command == 'twitter_vs_sgforums' or command == 'twitter_vs_facebook':
        # -------------- create list of features using actual word for token before in road and bus stop
        # use for road and bus stop
        types = ['road', 'busstop']
        for value in types:
            dict_ = load_dict_token_bef_aft_Twitter(value)
            ftr_token_bef_road_busstop(path_write, 'ftr_token_bef_' + value, list_line_, dict_)

        types = ['bef_svc', 'aft_svc']
        for value in types:
            dict_ = load_dict_token_bef_aft_Twitter(value)
            ftr_token_bef_aft_svc(value, path_write, 'ftr_token_' + value, list_line_, dict_)

    elif command == 'sgforums_vs_twitter' or command == 'sgforums_vs_facebook':
        # -------------- create list of features using actual word for token before in road and bus stop
        # use for road and bus stop
        types = ['road', 'busstop']
        for value in types:
            dict_ = load_dict_token_bef_aft(value)
            ftr_token_bef_road_busstop(path_write, 'ftr_token_bef_' + value, list_line_, dict_)

        types = ['bef_svc', 'aft_svc']
        for value in types:
            dict_ = load_dict_token_bef_aft(value)
            ftr_token_bef_aft_svc(value, path_write, 'ftr_token_' + value, list_line_, dict_)

    elif command == 'facebook_vs_twitter' or command == 'facebook_vs_sgforums':
        # -------------- create list of features using actual word for token before in road and bus stop
        # use for road and bus stop
        types = ['road', 'busstop']
        for value in types:
            dict_ = load_dict_token_bef_aft_Facebook(value)
            ftr_token_bef_road_busstop(path_write, 'ftr_token_bef_' + value, list_line_, dict_)

        types = ['bef_svc', 'aft_svc']
        for value in types:
            dict_ = load_dict_token_bef_aft_Facebook(value)
            ftr_token_bef_aft_svc(value, path_write, 'ftr_token_' + value, list_line_, dict_)