Пример #1
0
    # Build the POS-tag bigrams for both data splits.
    pos_bigrams_train = getBigrams(pos_tags_train)
    pos_bigrams_test = getBigrams(pos_tags_test)

    # Only the training split contributes to the set of unique POS bigrams.
    unique_bigrams = getBigramsSet(pos_bigrams_train)

    # Score the unique POS bigrams per category from the training bigrams and
    # training labels: category 0 is kept as the "objective" scores, category 1
    # as the "subjective" scores. The same two results are reused for the test
    # messages below, because the labels of new messages are unknown and so no
    # test-specific scores can be built.
    pos_bigrams_scores_objective = posBigramsScore(
        unique_bigrams, 0, pos_bigrams_train, labels_train
    )
    pos_bigrams_scores_subjective = posBigramsScore(
        unique_bigrams, 1, pos_bigrams_train, labels_train
    )

    # Score the words of the MPQA lexicons against the processed training
    # messages and their labels.
    # NOTE(review): the original note calls these "precision and F1" scores;
    # getScores is not visible here, so confirm what it actually returns.
    mpqaScores = getScores(mpqa_lexicons, process_messages_train, labels_train)

    # Feature extraction for the training messages.
    features_train = features_subjectivity.getFeatures(
        messages_train,
        process_messages_train,
        tokens_train,
        process_tokens_train,
        pos_tags_train,
        slangDictionary,
        lexicons,
        mpqa_lexicons,
        pos_bigrams_train,
        pos_bigrams_scores_objective,
        pos_bigrams_scores_subjective,
        mpqaScores,
        negationList,
        clusters
    )

    # Regularize the training features.
    features_train = regularization.regularize(features_train)

    # Feature extraction for the test messages: same resources and the same
    # training-derived scores, only the per-message inputs change.
    features_test = features_subjectivity.getFeatures(
        messages_test,
        process_messages_test,
        tokens_test,
        process_tokens_test,
        pos_tags_test,
        slangDictionary,
        lexicons,
        mpqa_lexicons,
        pos_bigrams_test,
        pos_bigrams_scores_objective,
        pos_bigrams_scores_subjective,
        mpqaScores,
        negationList,
        clusters
    )

    # Regularize the test features.
    # NOTE(review): this is a separate call from the one on the training
    # features; if regularize derives statistics from its input, train and
    # test may end up on different scales -- confirm.
    features_test = regularization.regularize(features_test)
else:
    # Encode the polarity labels as integers:
    #   "negative"      -> 0
    #   any other label -> 1 (treated as the positive class)
    labels_train = [1 if x != "negative" else 0 for x in labels_train]
    labels_test = [1 if x != "negative" else 0 for x in labels_test]