def loading_target_CRFs(command): if command == 'twitter': # loading target labels path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF' name_ = 'labeling_all.txt' list_line_ = filterTxt_CRF(load_file(path_, name_), command='removeLink') elif command == 'sgforums': # loading target labels path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF' name_ = 'Label_all_crf.txt' list_line_ = load_file(path_, name_) elif command == 'facebook': # loading target labels path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF' name_ = 'label.txt' list_line_ = filterTxt_CRF(load_file(path_, name_), command='removePunc') Y = np.array(load_target_label(list_line_)) print 'Finish loading target label ' + command return Y
from CRF_labeling.CRF_POS import create_part_of_speech, intersection_POS_CRF
from CRF_labeling.filterText_CRF import filterTxt_CRF
from CRF_running import folder_files, featuers_CRF, construct_ftr_CRF, load_target_label, n_cross_valid_crf
import numpy as np


if __name__ == '__main__':
    # TWITTER
    # Get the text for part-of-speech tagging and our labelled data.
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
    name_POS = 'labeling_text_POS.txt'
    name_CRF = 'labeling_all.txt'
    # NOTE(review): load_file is not among the imports above -- presumably it
    # is defined elsewhere in this module; verify before running.
    list_ = load_file(path, name_POS)
    list_all = create_part_of_speech(list_)
    # Filter both the POS text and the labelled text the same way (links
    # removed) so their tokens can be aligned.
    list_POS = filterTxt_CRF(list_all, command='removeLink', command_data='twitter')
    list_CRF = filterTxt_CRF(load_file(path, name_CRF), command='removeLink', command_data='twitter')
    list_new = intersection_POS_CRF(list_POS, list_CRF)
    list_new_POS, list_new_CRF = list_new[0], list_new[1]
    # note that part-of-speech will have same length now
    list_line_ = list_new_CRF

    # loading CRF features -- only one feature directory is active; the
    # commented alternatives are kept for experiment switching
    # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_POS'
    # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_POS_withoutREG'
    path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_without_POS_RE'
    files_ = folder_files(path_ftr)
    features = featuers_CRF(files_, path_ftr)
    X = np.array(construct_ftr_CRF(features))  # construct the features for CRF
    print X.shape
    print 'Finish loading features for CRF'
# -------------- evaluate the regular-expression features against the labels
# NOTE(review): path_ and list_line_ are expected to be defined by the
# preceding __main__ block -- confirm before running this fragment.
name_svc = 'ftr_reg_svc.csv'
name_road = 'ftr_reg_match_road.csv'
name_busstop = 'ftr_reg_match_busstop.csv'
list_svc = convert_label_CRF(convert_list_CRF(load_file(path_, name_svc)), 'svc')
list_road = convert_label_CRF(convert_list_CRF(load_file(path_, name_road)), 'road')
list_busstop = convert_label_CRF(convert_list_CRF(load_file(path_, name_busstop)), 'busstop')
CRF_F1_reg(list_line_, list_svc, list_road, list_busstop)


if __name__ == '__main__':
    # Calculate the F1 of regular expression
    # Only one dataset section is active at a time; SGFORUMS and TWITTER are
    # kept commented for experiment switching.

    # SGFORUMS
    # path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF'
    # name_ = 'Label_all_crf.txt'
    # list_line = convert_list_CRF(load_target_label(load_file(path, name_)))
    # loading_reg_ftr(list_line, command='sgforums')

    # TWITTER
    # path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
    # name_ = 'labeling_all.txt'
    # list_line = convert_list_CRF(load_target_label(filterTxt_CRF(load_file(path_, name_), command='removeLink')))
    # loading_reg_ftr(list_line, command='twitter')

    # FACEBOOK
    path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF'
    name_ = 'label.txt'
    list_line = convert_list_CRF(load_target_label(filterTxt_CRF(load_file(path_, name_), command='removePunc')))
    loading_reg_ftr(list_line, command='facebook')
# path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_30' # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_70' # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_100' # path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_150' path_ftr = "D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLinkWordVector/wordVec_200" files_ = folder_files(path_ftr) features = featuers_CRF(files_, path_ftr) # construct_ftr_CRF_wordVector(features) X = np.array(construct_ftr_CRF_wordVector(features)) # construct the features for CRF print X.shape print "Finish loading features for CRF" # # # loading target labels path_ = "D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF" name_ = "labeling_all.txt" list_line_ = filterTxt_CRF(load_file(path_, name_), "removeLink", "model") Y = np.array(load_target_label(list_line_)) print Y.shape print "Finish loading target label" # # # for index in range(0, len(X)): # # print len(X[index]), len(Y[index]) # # # running CRF models # n_cross_valid_crf(X, Y, K=5, command='metrics_F1') # use to calculate the F1 for classification n_cross_valid_crf(X, Y, K=5, command="confusion_matrix") # use to calculate the F1 for classification # # n_cross_valid_crf(X, Y, K=5, command='write_results') # use to calculate the confusion matrix # # stop = timeit.default_timer() # print 'Finish running CRF model %.3f sec' % (stop - start)
# path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF' # name_clf = 'results_LR_rmLink_twitter.txt' # name_crf = 'results_CRF_rmLink_twitter.txt' # name_file = 'labeling_all.txt' # # list_clf = load_results_clf(load_file(path_, name_clf)) # list_crf = load_results_CRF(load_file(path_, name_crf)) # list_word = load_text(filterTxt_CRF(load_file(path_, name_file), command='removeLink')) # # # get_CRFwrong_CLFright(list_crf[0], list_clf[0], list_crf[1], list_word) # get_CRFright_CLFwrong(list_crf[0], list_clf[0], list_crf[1], list_word) ############################################################################ ############################################################################ # USING FOR FACEBOOK DATASET ############################################################################ # use for removing all punctuation and link path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF' name_clf = 'results_LR.txt' # results of classification name_crf = 'results_CRF.txt' # results of CRF name_file = 'label.txt' list_clf = load_results_clf(load_file(path_, name_clf)) list_crf = load_results_CRF(load_file(path_, name_crf)) # In Twitter, we used 'removeLink' but we need to used 'removePunc' in Facebook list_word = load_text(filterTxt_CRF(load_file(path_, name_file), command='removePunc')) # get_CRFwrong_CLFright(list_crf[0], list_clf[0], list_crf[1], list_word) get_CRFright_CLFwrong(list_crf[0], list_clf[0], list_crf[1], list_word)
# # n_cross_valid_clf_CRF(X, Y, clf, K=5, command='confusion_matrix') # use to calculate the confusion matrix # # n_cross_valid_clf_CRF(X, Y, clf, K=5, command='write_results') # use to calculate the confusion matrix ################################################################################################# # TWITTER # loading CRF features, remember that we only remove all punctuation and links in Twitter path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLink' files_ = folder_files(path_ftr) features = featuers_CRF(files_, path_ftr) X = np.array(convert_ftr_x_clf(features)) print len(X) # # # # # # # loading target labels path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF' name_ = 'labeling_all.txt' list_line_ = filterTxt_CRF(load_file(path_, name_), 'removeLink') # remove all punctuations & links # label_distribution(list_line_) # get the distribution of labeling in Twitter Y = np.array(convert_frt_y_clf(list_line_)) print len(Y) # print 'Loading the target labels ------------------------------------' # # # # # # # # # clf = MultinomialNB() # clf = svm.LinearSVC(C=1.0, random_state=0, class_weight='auto') # # clf = LogisticRegression(class_weight='auto') n_cross_valid_clf_CRF(X, Y, clf, K=5, command='metrics_F1') # use to calculate the F1 for classification # # n_cross_valid_clf_CRF(X, Y, clf, K=5, command='confusion_matrix') # use to calculate the confusion matrix # # # n_cross_valid_clf_CRF(X, Y, clf, K=5, command='write_results') # use to calculate the confusion matrix ################################################################################################# # FACEBOOK # loading CRF features, remember that we only remove all punctuation and links in Facebook
############################################################################ start = timeit.default_timer() # get the start time # loading CRF features path_ftr = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLink' files_ = folder_files(path_ftr) features = featuers_CRF(files_, path_ftr) X = np.array(construct_ftr_CRF(features)) # construct the features for CRF print X.shape print 'Finish loading features for CRF' # loading target labels path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF' name_ = 'labeling_all.txt' list_line_ = filterTxt_CRF(load_file(path_, name_), 'removeLink', 'model') Y = np.array(load_target_label(list_line_)) print Y.shape print 'Finish loading target label' # for index in range(0, len(X)): # print len(X[index]), len(Y[index]) # running CRF models # n_cross_valid_crf(X, Y, K=5, command='metrics_F1') # use to calculate the F1 for classification n_cross_valid_crf(X, Y, K=5, command='confusion_matrix') # use to calculate the F1 for classification # n_cross_valid_crf(X, Y, K=5, command='write_results') # use to calculate the confusion matrix stop = timeit.default_timer() print 'Finish running CRF model %.3f sec' % (stop - start)
def create_ftrList(path_write, command): if command == 'twitter_vs_sgforums' or command == 'facebook_vs_sgforums': path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF' name_ = 'Label_all_crf.txt' list_line_ = load_file(path_, name_) elif command == 'sgforums_vs_twitter' or command == 'facebook_vs_twitter': path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF' name_ = 'labeling_all.txt' list_line_ = filterTxt_CRF(load_file(path_, name_), command='removeLink') elif command == 'twitter_vs_facebook' or command == 'sgforums_vs_facebook': path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF' name_ = 'label.txt' list_line_ = filterTxt_CRF(load_file(path_, name_), command='removePunc') else: print 'Need to give the correct command' quit() # -------------- loading all tokens features type ftr_isDigit = isAllDigit(list_line_) write_file(path_write, 'ftr_isDigit', ftr_isDigit) ftr_isCharacter = isAllCharacter(list_line_) write_file(path_write, 'ftr_isCharacter', ftr_isCharacter) ftr_isBusPlate = is_busPlate(list_line_) write_file(path_write, 'ftr_isBusPlate', ftr_isBusPlate) ftr_isCapitalized = isCapitalize(list_line_) write_file(path_write, 'ftr_isCapitalized', ftr_isCapitalized) # -------------- loading all tokens match dictionary # note that the dictionary is the same in different datasets: SGFORUMS, TWITTER, FACEBOOK ftr_match_dict_svc = match_dict(list_line_, command='svc') write_file(path_write, 'ftr_match_dict_svc', ftr_match_dict_svc) ftr_match_dict_road = match_dict(list_line_, command='road') write_file(path_write, 'ftr_match_dict_road', ftr_match_dict_road) ftr_match_dict_busstop = match_dict(list_line_, command='busstop') write_file(path_write, 'ftr_match_dict_busstop', ftr_match_dict_busstop) if command == 'facebook_vs_sgforums' or command == 'facebook_vs_twitter': ftr_match_dict_busstopCode = match_dict(list_line_, 
command='busstopCode') write_file(path_write, 'ftr_match_dict_busstopCode', ftr_match_dict_busstopCode) # -------------- loading all tokens match bus service, road and bus stop using regular expression # note that the regular expression is the same in different datasets: SGFORUMS, TWITTER, FACEBOOK ftr_reg_svc = reg_bussvc(list_line_, n_token=10) write_file(path_write, 'ftr_reg_svc', ftr_reg_svc) ftr_match_road = match_road_busstop(list_line_, command='road') write_file(path_write, 'ftr_reg_match_road', ftr_match_road) ftr_match_busstop = match_road_busstop(list_line_, command='busstop') write_file(path_write, 'ftr_reg_match_busstop', ftr_match_busstop) # -------------- loading all token before and after match dictionary # note that the dictionary is the same in different datasets: SGFORUMS, TWITTER, FACEBOOK types = ['svc', 'road', 'busstop'] for value in types: list_command = token_bef_matchDict(list_line_, value) write_file(path_write, 'ftr_tok_bef_match_' + value, list_command) for value in types: list_command = token_aft_matchDict(list_line_, value) write_file(path_write, 'ftr_tok_aft_match_' + value, list_command) # -------------- loading all token type before and after labeling types = ['Capitalized', 'Digit', 'Character', 'BusPlate'] for value in types: list_command = token_bef_type(list_line_, value) write_file(path_write, 'ftr_tok_bef_is' + value, list_command) for value in types: list_command = token_aft_type(list_line_, value) write_file(path_write, 'ftr_tok_aft_is' + value, list_command) if command == 'twitter_vs_sgforums' or command == 'twitter_vs_facebook': # -------------- create list of features using actual word for token before in road and bus stop # use for road and bus stop types = ['road', 'busstop'] for value in types: dict_ = load_dict_token_bef_aft_Twitter(value) ftr_token_bef_road_busstop(path_write, 'ftr_token_bef_' + value, list_line_, dict_) types = ['bef_svc', 'aft_svc'] for value in types: dict_ = 
load_dict_token_bef_aft_Twitter(value) ftr_token_bef_aft_svc(value, path_write, 'ftr_token_' + value, list_line_, dict_) elif command == 'sgforums_vs_twitter' or command == 'sgforums_vs_facebook': # -------------- create list of features using actual word for token before in road and bus stop # use for road and bus stop types = ['road', 'busstop'] for value in types: dict_ = load_dict_token_bef_aft(value) ftr_token_bef_road_busstop(path_write, 'ftr_token_bef_' + value, list_line_, dict_) types = ['bef_svc', 'aft_svc'] for value in types: dict_ = load_dict_token_bef_aft(value) ftr_token_bef_aft_svc(value, path_write, 'ftr_token_' + value, list_line_, dict_) elif command == 'facebook_vs_twitter' or command == 'facebook_vs_sgforums': # -------------- create list of features using actual word for token before in road and bus stop # use for road and bus stop types = ['road', 'busstop'] for value in types: dict_ = load_dict_token_bef_aft_Facebook(value) ftr_token_bef_road_busstop(path_write, 'ftr_token_bef_' + value, list_line_, dict_) types = ['bef_svc', 'aft_svc'] for value in types: dict_ = load_dict_token_bef_aft_Facebook(value) ftr_token_bef_aft_svc(value, path_write, 'ftr_token_' + value, list_line_, dict_)