def clf_one_one(path, events, numFold):
    """Run one-vs-one event classification on pre-split folds and save the confusion matrices.

    For each fold index a LogisticRegression model is built for every unordered
    pair of labels (the given events plus the extra label 'none'). The per-pair
    predictions returned by ``clf_event_one_one`` are concatenated element-wise
    as strings, ``find_max_label`` turns the concatenation into one predicted
    label per test item, and the result is compared with the test labels via
    ``confusion_matrix``. Every matrix row is printed and also collected for
    the output file.

    :param path: location passed to ``testing_data``, ``load_file`` and
        ``write_file`` (presumably the data directory -- confirm against those helpers).
    :param events: list of event names; it is NOT modified, 'none' is added to
        an internal copy.
    :param numFold: number of folds; also part of the training file names
        ``<numFold>Folds_<a>_<b>_training_<fold>.csv``.
    :return: None. Results are printed and written with ``write_file``.
    """
    # Work on a copy: appending to the argument would add another 'none' to the
    # caller's list on every call.
    labels = list(events) + ['none']
    # Tab-separated confusion-matrix rows of all folds, in fold order.
    list_write = list()

    for fold in range(numFold):
        label_test, sent_test = testing_data(path, labels, fold, numFold)
        print('%s %s' % (len(label_test), len(sent_test)))

        # One prediction list per label pair (j < k), in pair order.
        list_pred = list()
        for j in range(len(labels)):
            for k in range(j + 1, len(labels)):
                pair = labels[j] + '_' + labels[k]
                # Use this function's own arguments for the file lookup instead
                # of module-level names that merely happen to exist in a script.
                sent_train = load_file(path, str(numFold) + 'Folds_' + pair
                                       + '_training_' + str(fold) + '.csv')
                print('Running event:  ' + pair + ':Fold_index_' + str(fold))
                list_all = load_event_x_y(pair, sent_train, command='')
                X, Y = np.array(list_all[0]), np.array(list_all[1])
                clf = LogisticRegression(max_iter=50000, solver='liblinear', tol=0.000001, class_weight='auto')
                list_pred.append(clf_event_one_one(clf, X, Y, sent_test, j, k))

        # Concatenate the pairwise predictions per test item; the first list is
        # taken as-is, later ones are appended as strings.
        list_join = list_pred[0] if list_pred else list()
        for preds in list_pred[1:]:
            list_join = [str(x) + str(y) for x, y in zip(list_join, preds)]

        pred_label = find_max_label(list_join, labels)
        # NOTE(review): if this is sklearn.metrics.confusion_matrix, its
        # signature is (y_true, y_pred), so rows here would be predicted labels -- confirm.
        list_matrix = confusion_matrix(pred_label, label_test)
        for row in list_matrix:
            line = '\t'.join(str(value) for value in row)
            print(line)
            # Keep every row of the matrix, not only the last one of the fold.
            list_write.append(line)

    # NOTE(review): `event` is not defined in this function; it resolves to a
    # module-level name (e.g. a script loop variable) at call time -- confirm the
    # intended output file name.
    write_file(path, 'twitter_event_' + event, list_write)

if __name__ == '__main__':
    # Labelled tweets: one '<event>.csv' file per event, used as training data.
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_event/detectAllEvents/allTweets_ver3'
    events = ['wait', 'missing', 'skip', 'slow', 'accident', 'crowd']

    # Location handed to writing_pred for the predictions.
    path_load = 'C:/Users/vdthoang/Google Drive/LARC - NEC Project/icwsm2016/data'

    # Items to label. An earlier variant loaded them from 'twitter.csv' under
    # path_load via load_data(..., 'preprocessText') and load_data_ID(...).
    X_id, X_ = load_data_ubicomp()

    # Train one binary model per event and predict that event for every item.
    for event in events:
        sentences = load_file(path, event + '.csv')
        print('Running event:  ' + event)
        dataset = load_event_x_y(event, sentences, command='preprocessText')

        X_train = np.array(dataset[0])
        Y_train = np.array(dataset[1])
        # Alternative tried before: LinearSVC(C=1.0, random_state=0, class_weight='auto', max_iter=100000)
        clf = LogisticRegression(max_iter=50000, solver='liblinear', tol=0.000001, class_weight='auto')
        predictions = clf_event_predicted(X_, X_train, Y_train, clf)
        writing_pred(path_load, event, X_id, X_, predictions)



    # NOTE(review): these statements look like the tail of a function whose `def`
    # line and first statements are not present here; X_convert_trans, Y,
    # X_pred_trans, texts, command and event are not defined in the visible code.
    clf.fit(X_convert_trans, Y)  # training model
    # Predicted label and raw decision score for every item to be labelled.
    y_pred = clf.predict(X_pred_trans)
    y_prob = clf.decision_function(X_pred_trans)

    # Bounds used to min-max scale the decision scores below.
    max_prob, min_prob = max(y_prob), min(y_prob)
    list_write = list()
    for i in range(0, len(y_pred)):
        # Score rescaled to [0, 1]; only printed, never written to the file.
        # NOTE(review): divides by zero when all decision scores are equal.
        prob = (y_prob[i] - min_prob) / (max_prob - min_prob)
        print y_pred[i], prob, texts[i]

        # list_write.append(str(y_pred[i]) + '\t' + texts[i])
        # Only the predicted label is kept for the output file.
        list_write.append(str(y_pred[i]))

    # Predictions are written only for the 'twitter' command, to a fixed location.
    if command == 'twitter':
        path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/twitter/events_pred'
        write_file(path_write, event, list_write)


if __name__ == '__main__':
    ################################################################################################
    # TWITTER
    # Labelled tweets: one '<event>.csv' file per event.
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_event/detectAllEvents/allTweets_ver3'
    events = ['wait', 'missing', 'skip', 'slow', 'accident', 'crowd']

    # Build and apply one prediction model per event.
    for event in events:
        list_sentences = load_file(path, event + '.csv')
        print('Running event:  ' + event)
        list_all = load_event_x_y(event, list_sentences, '')
        raw_texts, raw_labels = list_all[0], list_all[1]
        X = np.array(raw_texts)
        Y = np.array(raw_labels)
        # NOTE(review): X and clf are not passed to event_pred_model; presumably
        # it reads clf as a module-level name -- confirm before removing either.
        clf = LinearSVC(C=1.0, random_state=0, class_weight='auto', max_iter=100000)
        event_pred_model(event, raw_texts, Y, command='twitter')
# Example no. 4 -- appears to be a separator left over from the listing these snippets came from (not code)
# 0
    # Other data sets this script has been run on (kept for reference):
    #   Facebook, all events at once:
    #     path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_event/detectAllEvents_ver2'
    #     events_ = ['complaint', 'compliment', 'skip', 'suggestion', 'wait']
    #     events_all(path_, events_)
    #   Sgforums:
    #     path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_classification_events/detectAllEvents_ver2'
    #     events_ = ['bunch', 'crowd']

    # Facebook: pairwise (one-vs-one) classification over every pair of labels.
    path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_event/detectAllEvents_ver2'
    events_ = ['complaint', 'compliment', 'skip', 'suggestion', 'wait']
    events_.append('none')  # 'none' takes part in the pairing like any event

    n_labels = len(events_)
    for i in range(n_labels):
        # Only pairs with i < k, so each unordered pair is handled once.
        for k in range(i + 1, n_labels):
            pair = events_[i] + '_' + events_[k]
            print('%s %s' % (events_[i], events_[k]))
            list_sentences = load_file(path_, pair + '.csv')
            print('Running event:  ' + pair)
            list_all = load_event_x_y(pair, list_sentences, command='')
            X = np.array(list_all[0])
            Y = np.array(list_all[1])
            # Alternatives tried before: MultinomialNB(), and
            # LinearSVC(C=1.0, random_state=0, class_weight='auto', max_iter=100000)
            clf = LogisticRegression(max_iter=50000, solver='liblinear', tol=0.000001, class_weight='auto')

            # Other invocations used previously:
            #   clf_event_running(X, Y, clf, K=5, command='KFold')
            #   clf_event_running(path, event, 'LR', X, Y, clf, K=5, command='StratifiedKFold', call='PrintPredicted')
            #   clf_event_running(path, event, 'LR', X, Y, clf, K=5, command='StratifiedKFold', call='ProbScore')
            clf_event_running(path_, pair, 'LR', X, Y, clf, K=5, command='StratifiedKFold', call='')