예제 #1
0
def classificationTest(
    train_set,
    train_label,
    test_set,
    test_label,
    lowFreqK=2,
    classifier=None):
    """Train a bag-of-words classifier and score it on a held-out set.

    The texts are lower-cased, stripped of commas and periods, split on
    whitespace, filtered against a small stop-word list and then against a
    minimum token frequency.  A dictionary is built from the training
    documents only, every document is turned into a bag-of-words vector,
    the classifier is fitted on the training rows and evaluated on the
    test rows.

    Args:
        train_set: Training texts.  Must support ``[:]`` followed by
            ``.tolist()`` (presumably a NumPy array of strings -- confirm
            against the caller).
        train_label: Labels for ``train_set``, passed straight to ``fit``.
        test_set: Iterable of test texts (strings).
        test_label: True labels for ``test_set``.
        lowFreqK: Tokens occurring ``lowFreqK`` times or fewer are dropped.
        classifier: Estimator exposing ``fit`` and ``predict``.  When
            ``None`` (the default) a fresh ``MultinomialNB()`` is created
            for this call, so no fitted state is shared between calls.
            ``RandomForestClassifier(n_estimators=100)`` is an alternative
            suggested by the original author.

    Returns:
        A list of seven scores, in this order: accuracy, precision for
        label 1, precision for label 0, recall for label 1, recall for
        label 0, F1 for label 1, F1 for label 0.  The labels are hard-coded
        as 1 and 0, so they must match the real class values in the data.
    """
    # A default of ``MultinomialNB()`` in the signature would be evaluated
    # once at definition time and the same instance re-fitted on every call.
    if classifier is None:
        classifier = MultinomialNB()

    # Single-argument parenthesised form prints the same text on Python 2
    # and Python 3.
    print('classification test processing ...')

    num_train = len(train_set)
    documents = train_set[:].tolist()
    documents.extend(test_set)

    # Remove stop words.
    stoplist = set('for a of the and to in'.split())
    allTexts = [[
        word
        for word in text.lower().replace(',', '').replace('.', '').split()
        if word not in stoplist
    ] for text in documents]

    # Remove low-frequency words.
    # NOTE: frequencies are counted over training AND test documents.
    frequency = defaultdict(int)
    for text in allTexts:
        for token in text:
            frequency[token] += 1
    allTexts = [[token for token in text if frequency[token] > lowFreqK]
                for text in allTexts]

    # Build the dictionary from the training documents only.
    dictionary = corpora.Dictionary(allTexts[0:num_train])

    # How to turn the bag-of-words dicts into vector form:
    # http://www.mamicode.com/info-detail-1518042.html
    # dict2matrix is defined elsewhere; its result is densified via
    # .toarray(), so it presumably returns a sparse matrix.
    num_terms = len(dictionary.keys())
    all_features = dict2matrix([dictionary.doc2bow(text) for text in allTexts],
                               num_terms).toarray()
    train_data_features = all_features[0:num_train]
    test_data_features = all_features[num_train:]

    classifier = classifier.fit(train_data_features, train_label)
    result = classifier.predict(test_data_features)

    # Per-class metrics; pos_label values must correspond to the actual
    # class labels used in the data.
    res = [
        accuracy_score(test_label, result),
        precision_score(test_label, result, pos_label=1),
        precision_score(test_label, result, pos_label=0),
        recall_score(test_label, result, pos_label=1),
        recall_score(test_label, result, pos_label=0),
        f1_score(test_label, result, pos_label=1),
        f1_score(test_label, result, pos_label=0)
    ]
    return res
예제 #2
0
def classify_test_21(train_set,
                     train_label,
                     test_set,
                     test_label,
                     reverseVetorize=False):
    print 'final_sa_method:classify_test'
    all = train_set[:].tolist()
    all.extend(test_set)
    # 去停用词
    stoplist = set('for a of the and to in'.split())
    allTexts = [[
        word
        for word in text.lower().replace(',', '').replace('.', '').split()
        if word not in stoplist
    ] for text in all]
    # 去低频词
    k = 2
    frequency = defaultdict(int)
    for text in allTexts:
        for token in text:
            frequency[token] += 1
    allTexts = [[token for token in text if frequency[token] > k]
                for text in allTexts]

    # 构建字典
    dictionary = corpora.Dictionary(allTexts[0:len(train_set)])

    # 怎么把dict转化为列表形式的向量 http://www.mamicode.com/info-detail-1518042.html
    num_terms = len(dictionary.keys())
    all_features = dict2matrix([dictionary.doc2bow(text) for text in allTexts],
                               num_terms).toarray()
    train_data_features = all_features[0:len(train_set)]
    test_data_features = all_features[len(train_set):]

    #print train_data_features.toarray()

    classier = MultinomialNB()  # RandomForestClassifier(n_estimators=100)
    classier = classier.fit(train_data_features, train_label)

    print "Predicting test labels..."
    result = classier.predict(test_data_features)
    print 'result: ', result

    from sklearn.metrics import accuracy_score, confusion_matrix
    print accuracy_score(test_label, result)
    printlabels = [1, 0]  # 这个要对应实际的类别类型
    print printlabels
    print confusion_matrix(test_label, result, labels=printlabels)