# Example #1
0
def extract_features(data):
    """Build the best-word feature representation for every item in *data*.

    Each item is passed, together with the current best-word collection, to
    ``extractFeatures.best_word_features``; the results are returned as a
    list in the same order as *data*.

    NOTE(review): ``get_best_words`` is not defined in the visible part of
    this file; it is assumed to be available at module level.
    """
    import extractFeatures as ef
    selected_words = get_best_words()
    return [ef.best_word_features(item, selected_words) for item in data]
import itertools 
import evalueClassier as ec
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
# Select positive and negative features.
# NOTE(review): `pickle` and `ef` are used below but are not imported in the
# visible part of this file -- confirm they are imported at module level.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The files are opened through context managers so the handles are closed
# deterministically, and in binary mode, which is what pickle expects.
with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
    pos_review = pickle.load(pos_file)
with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
    neg_review = pickle.load(neg_file)
# The negative reviews are repeated three times before slicing
# (presumably so that at least 50 are available -- TODO confirm).
neg_review = neg_review * 3
pos = pos_review[:50]
neg = neg_review[:50]

# Score words against both classes and keep the 1000 best-scoring ones.
word_scores = ef.create_word_scores(pos, neg, 'pos', 'neg')
best_words = ef.find_best_words(word_scores, 1000)

# Reduce every review to its best-word features and tag it with its label.
posFeatures = []
for p in pos:
    pos_selected = ef.best_word_features(p, best_words)
    posFeatures.append(ef.tagFeatures(pos_selected, 'pos'))
negFeatures = []
for n in neg:
    neg_selected = ef.best_word_features(n, best_words)
    negFeatures.append(ef.tagFeatures(neg_selected, 'neg'))

# Divide features into train, devtest and test sets.
# NOTE(review): trainSet takes all 50 items per class, so devtestSet and
# testSet (items 40-49) overlap the training data and are identical to each
# other; scores measured on them will be optimistic. Left unchanged here --
# confirm whether trainSet was meant to stop at index 40.
trainSet = posFeatures[:50] + negFeatures[:50]
devtestSet = posFeatures[40:50] + negFeatures[40:50]
testSet = posFeatures[40:50] + negFeatures[40:50]
# Single-argument print(...) produces the same output as the Python 2 print
# statement and also parses under Python 3.
print(testSet)

# Evaluate each scikit-learn classifier on the devtest set.
classifer_dict = {
    'BernoulliNB': BernoulliNB(),
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'LinearSVC': LinearSVC(),
    'NuSVC': NuSVC(),
}
for classiferName, classiferFunc in classifer_dict.items():
    ec.showEvalueResult(trainSet, devtestSet, classiferName, classiferFunc)
    open(
        '/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl',
        'r'))
# Load the negative reviews.
# NOTE(review): `pos_review` is expected to have been loaded before this
# point; `pickle`, `ef` and `SklearnClassifier` are not imported in the
# visible part of this file -- confirm they are imported at module level.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The file is opened through a context manager so the handle is closed
# deterministically, and in binary mode, which is what pickle expects.
with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
    neg_review = pickle.load(neg_file)
# The negative reviews are repeated three times before slicing
# (presumably so that at least 50 are available -- TODO confirm).
neg_review = neg_review * 3
pos = pos_review[:50]
neg = neg_review[:50]

# Score words against both classes and keep the 1000 best-scoring ones.
word_scores = ef.create_word_scores(pos, neg, 'pos', 'neg')
best_words = ef.find_best_words(word_scores, 1000)

# Reduce every review to its best-word features and tag it with its label.
posFeatures = []
for p in pos:
    pos_selected = ef.best_word_features(p, best_words)
    posFeatures.append(ef.tagFeatures(pos_selected, 'pos'))
negFeatures = []
for n in neg:
    neg_selected = ef.best_word_features(n, best_words)
    negFeatures.append(ef.tagFeatures(neg_selected, 'neg'))

# Divide features into train, devtest and test sets.
# trainSet takes all 50 items per class; devtestSet and testSet are the same
# items 40-49 and therefore overlap the training data.
trainSet = posFeatures[:50] + negFeatures[:50]
devtestSet = posFeatures[40:50] + negFeatures[40:50]
testSet = posFeatures[40:50] + negFeatures[40:50]

# Train and save classifier (the pickle.dump call that follows does the save).
NuSVC_classifier = SklearnClassifier(NuSVC(probability=True))
NuSVC_classifier.train(trainSet)
pickle.dump(
    NuSVC_classifier,
    open(