def getCityTopWords(trainCity0Rss, trainCity1Rss):
    """
    获取城市中评论中最多的词汇
    :param trainCity1Rss:
    :param trainCity0Rss:
    """
    initialDocList, fullText, cityTypes = loadRSSText(trainCity0Rss, trainCity1Rss)
    vocaList = bayes.createVocabularyList(initialDocList)
    trainVocabularyMattrix = []
    # 将训练的文档集合针对vocaList进行标记
    for words in initialDocList:
        signedFeatureList = bayes.checkSignedFeatureList(vocaList, words)
        trainVocabularyMattrix.append(signedFeatureList)

    p_WiBasedOnClass0, p_WiBasedOnClass1, pAbusive = bayes.trainNavieBayesian(trainVocabularyMattrix, cityTypes)

    topCity0Words = []
    topCity1Words = []
    for i in range(len(p_WiBasedOnClass0)):
        if p_WiBasedOnClass0[i] > -6.0:
            topCity0Words.append(vocaList[i])
        if p_WiBasedOnClass1[i] > -6.0:
            topCity1Words.append(vocaList[i])

    print "*******City0最常用20的词汇*********"
    for word in topCity0Words[:20]:
        print word
    print "*******City1最常用的词汇*********"
    for word in topCity1Words[:20]:
        print word
def classifyNavieBayesianTest():
    wordsList, classTypes = bayes.loadDataSet()
    inputTestWords = ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']
    result = bayes.classifyNavieBayesian(wordsList, classTypes, inputTestWords)
    print inputTestWords, ':', result
    inputTestWords2 = ['love', 'stupid']
    result2 = bayes.classifyNavieBayesian(wordsList, classTypes, inputTestWords2)
    print inputTestWords2, ':', result2
def trainNavieBayesianTest():
    wordsList, classTypes = bayes.loadDataSet()
    vocaList = bayes.createVocabularyList(wordsList)
    # 将feature对应的标记为0,1
    trainVocabularyMattrix = []
    for words in wordsList:
        trainVocabularyMattrix.append(bayes.checkSignedFeatureList(vocaList, words))

    # print np.array(trainVocabularyMattrix)
    p_WiBasedOnClass0, p_WiBasedOnClass1, pAbusive = bayes.trainNavieBayesian(trainVocabularyMattrix, classTypes)
    print p_WiBasedOnClass0, '\n'
    print p_WiBasedOnClass1
    print pAbusive
def filterSpamEmail():
    """
    过滤垃圾邮件
    :return:
    """
    initialDocList, classTypes = loadEmailText()
    # 从initialDocList中随机创建10个待测试的文档
    testDocList = []
    # 待测试邮件的类型
    testDocClassList = []
    """
    注意此处随机选择10封email,添加到测试集合,同时将原有的数据集删除,
    这种随机选择数据的一部分作为训练集合,而剩余部分作为测试集合的过程称为
    留存交叉验证:hold-out cross validation
    """
    for i in range(10):
        randomIndex = int(random.uniform(0, len(initialDocList)))
        testDocClassList.append(classTypes[randomIndex])
        testDocList.append(initialDocList[randomIndex])
        del (initialDocList[randomIndex])
        del (classTypes[randomIndex])

    errorCount = 0
    for i in range(len(testDocList)):
        # 对给定的待测试的邮件进行分类
        classType = bayes.classifyNavieBayesian(
                initialDocList, classTypes, testDocList[i])
        if classType != testDocClassList[i]:  # 预测的结果和实际的结果进行比较
            print '分类错误的邮件:', testDocList[i], '\n属于', testDocClassList[i], \
                '错误分类成了:', classType
            errorCount += 1

    # 计算分类的误差
    print 'the error rate is :', float(errorCount) / len(testDocList)
def localWordsTest(city0Rss, city1Rss):
    """
    测试根据输入的text分类城市的准确率
    :param city0Rss:
    :param city1Rss:
    过滤垃圾邮件
    :return:
    """
    initialDocList, fullText, cityTypes = loadRSSText(city0Rss, city1Rss)
    voclist = bayes.createVocabularyList(initialDocList)
    print "未删除高频词汇的词汇表长度:", len(voclist)
    # 出现频率最高的词汇,例如:I and 等辅助词
    deletedVoc = calcFrequentWords(voclist, fullText)
    # 去除词汇列表的高频词汇
    for word in deletedVoc:
        if word[0] in voclist:
            voclist.remove(word[0])
    print "删除后的词汇表长度:", len(voclist)

    # 从initialDocList中随机创建10个待测试的文档
    testDocList = []
    # 待测试邮件的类型
    testDocClassList = []
    """
    注意此处随机选择10个数据,添加到测试集合,同时将原有的数据集删除,
    这种随机选择数据的一部分作为训练集合,而剩余部分作为测试集合的过程称为
    留存交叉验证:hold-out cross validation
    """
    for i in range(10):
        randomIndex = int(random.uniform(0, len(initialDocList)))
        testDocClassList.append(cityTypes[randomIndex])
        testDocList.append(initialDocList[randomIndex])
        del (initialDocList[randomIndex])
        del (cityTypes[randomIndex])

    errorCount = 0
    for j in range(len(testDocList)):
        classType = bayes.classifyNavieBayesian2(voclist, initialDocList, cityTypes, testDocList[j])
        if classType != testDocClassList[j]:  # 预测的结果和实际的结果进行比较
            print "分类错误的信息:", testDocList[j], "\n属于", testDocClassList[j], "错误分类成了:", classType
            errorCount += 1

    # 计算分类的误差
    errorRate = float(errorCount) / len(testDocList)
    print "the error rate is :", errorRate
    return errorRate
def createWordSetTest():
    wordsList, classTypes = bayes.loadDataSet()
    print wordsList
    wordsetList = bayes.createVocabularyList(wordsList)
    print wordsetList
    return wordsetList