def tfidf_knn(news, category):
    """Build one tf-idf vector per document, grouped by topic.

    Args:
        news, category: forwarded unchanged to ``idf_knn`` and to
            ``token__.tokenize2``.

    Returns:
        dict: ``matrix[topic][doc]`` is a list with one entry per word known
        to ``idf_knn``; the entry for a word is its occurrence count in the
        document multiplied by its idf weight, and 0 for absent words.
        Column order follows the iteration order of the idf dictionary.
        Each vector is handed to ``normalize`` before returning.
    """
    idf = idf_knn(news, category)
    wordlist = list(idf)
    # Word -> column lookup; avoids a linear ``list.index`` scan per token.
    column_of = dict((word, col) for col, word in enumerate(wordlist))
    tok = token__.tokenize2(news, category)
    labels = list(tok.keys())

    print('hello')
    matrix = {}
    for topic in labels:
        matrix[topic] = {}
        for doc in tok[topic].keys():
            matrix[topic][doc] = [0] * len(wordlist)

    print('world')
    for topic in labels:
        for doc in tok[topic].keys():
            # Term frequencies of this document, counted in a single pass.
            term_freq = {}
            for word in tok[topic][doc]:
                term_freq[word] = term_freq.get(word, 0) + 1
            row = matrix[topic][doc]
            for word, freq in term_freq.items():
                # Index and count by the *word*; the document key is not a
                # vocabulary entry and must not be used here.
                if word in column_of:
                    row[column_of[word]] = freq * idf[word]

    print('today')
    # Normalization. The return value of normalize() is ignored, so it is
    # presumably expected to modify the list in place -- TODO confirm.
    for topic in matrix:
        for doc in matrix[topic]:
            normalize(matrix[topic][doc])
    return matrix
def idf_knn(news, category):
    """Compute a base-2 idf weight for every word in the tokenised corpus.

    Args:
        news, category: forwarded unchanged to ``token__.tokenize2``.

    Returns:
        dict: ``idf[word] = log2(total / df)`` where ``df`` is the number of
        documents containing the word and ``total`` is the sum, over all
        documents, of the number of distinct words in each document.

    Side effects:
        Prints the vocabulary size.
    """
    tok = token__.tokenize2(news, category)

    doc_freq = {}   # word -> number of documents containing it
    total = 0       # sum of per-document distinct-word counts
    for topic in tok:
        for doc in tok[topic]:
            distinct = set(tok[topic][doc])
            total += len(distinct)
            for word in distinct:
                doc_freq[word] = doc_freq.get(word, 0) + 1

    # NOTE(review): the numerator is the total number of (document, distinct
    # word) pairs rather than the number of documents used by textbook idf;
    # kept as-is so existing weights do not change -- confirm intent.
    numerator = float(total)
    log_two = math.log(2)
    idf = {}
    for word, freq in doc_freq.items():
        idf[word] = math.log(numerator / float(freq)) / log_two
    print(len(idf))
    return idf
# Example no. 3
# 0
def classify(guess, p, category):
    """Classify tokenised documents and print per-class evaluation metrics.

    Args:
        guess: forwarded, together with ``category``, to
            ``token__.tokenize2``; the result is indexed as
            ``tok[i][doc] -> tokens`` where ``i`` is the index of the group
            the document belongs to (treated as its true class).
        p: mapping ``word -> {class index: score}`` (the shape returned by
            ``bayes``); scores of the known words of a document are summed
            per class and the highest sum wins.
        category: sequence of class names, indexed by class index.

    Returns:
        None. The predictions, then TP/FP/FN, precision, recall and F1 for
        every class, and finally the micro-averaged F1 are printed.
    """
    def ratio(numerator, denominator):
        # A class with no documents or no predictions yields 0.0 instead of
        # raising ZeroDivisionError.
        if not denominator:
            return 0.0
        return float(numerator) / float(denominator)

    tok = token__.tokenize2(guess, category)
    n_groups = len(tok)
    n_classes = len(category)

    # predicted[i][doc] is the class index chosen for `doc` of group i.
    predicted = {}
    for i in range(n_groups):
        predicted[i] = {}
        for doc in tok[i].keys():
            known = [p[word] for word in tok[i][doc] if word in p]
            best = None
            best_score = None
            for k in range(n_classes):
                score = 0
                for word_scores in known:
                    score += word_scores[k]
                # ``>=`` keeps the original tie-break: the last class with
                # the maximal score wins.
                if best is None or score >= best_score:
                    best, best_score = k, score
            if best is not None:
                predicted[i][doc] = best

    for i in range(n_groups):
        print(category[i] + ':')
        for doc in predicted[i].keys():
            print('  ' + category[predicted[i][doc]] + '  ' + doc)

    tp_total = 0  # overall TP
    fp_total = 0  # overall FP
    fn_total = 0  # overall FN
    for i in range(n_groups):
        own = list(predicted[i].values())
        tp = own.count(i)
        # Documents of class i that were assigned elsewhere are misses (FN).
        fn = len(own) - tp
        # Documents of other classes that were assigned to i are FP.
        fp = 0
        for j in range(n_groups):
            if j != i:
                fp += list(predicted[j].values()).count(i)
        print(category[i] + ': ' + 'TP = ' + str(tp) + ',FP = ' + str(fp) + ',FN = ' + str(fn))
        print('         Precision = ' + str(ratio(tp, tp + fp)) + ', Recall = ' + str(ratio(tp, tp + fn)))
        print('         F1 = ' + str(ratio(2 * tp, 2 * tp + fp + fn)))
        tp_total += tp
        fp_total += fp
        fn_total += fn
    print('         Microaveraged F1 = ' + str(ratio(2 * tp_total, 2 * tp_total + fp_total + fn_total)))
def bayes(news, category):
    """Estimate add-one-smoothed per-class word log-probabilities.

    Args:
        news, category: forwarded unchanged to ``token__.tokenize2``; the
            result is indexed as ``tok[i][doc] -> tokens`` for class index
            ``i`` in ``range(len(tok))``.

    Returns:
        dict: ``p[word][i] = log((count_i(word) + 1) / (size_i + V))`` where
        ``count_i`` is the number of occurrences of the word in class ``i``,
        ``size_i`` the total number of tokens in class ``i`` and ``V`` the
        number of distinct words over all classes.
    """
    tok = token__.tokenize2(news, category)
    n_classes = len(tok)

    class_counts = []   # class_counts[i][word] -> occurrences in class i
    class_sizes = []    # class_sizes[i] -> total tokens in class i
    vocabulary = set()
    for i in range(n_classes):
        counts = {}
        size = 0
        for doc in tok[i]:
            tokens = tok[i][doc]
            size += len(tokens)
            # Count once here instead of calling list.count() per word later.
            for word in tokens:
                counts[word] = counts.get(word, 0) + 1
        vocabulary.update(counts)
        class_counts.append(counts)
        class_sizes.append(size)

    vocab_size = len(vocabulary)
    p = {}
    for word in vocabulary:
        p[word] = {}
        for i in range(n_classes):
            seen = class_counts[i].get(word, 0)
            p[word][i] = math.log(float(seen + 1) / float(class_sizes[i] + vocab_size))
    return p
def not_naive(news, category):
    """Estimate smoothed word log-probabilities, keeping only selected words.

    The probabilities are computed as
    ``log((count_i(word) + 1) / (size_i + V))`` per class index ``i``. A word
    is kept only if some class that attains the word's maximal
    log-probability also contains at least half of all occurrences of that
    word in the corpus.

    Args:
        news, category: forwarded unchanged to ``token__.tokenize2``; the
            result is indexed as ``tok[i][doc] -> tokens`` for class index
            ``i`` in ``range(len(tok))``.

    Returns:
        dict: ``p_optimal[word][i]`` with the log-probability of every kept
        word for every class index.
    """
    tok = token__.tokenize2(news, category)
    n_classes = len(tok)

    class_counts = []   # class_counts[i][word] -> occurrences in class i
    class_sizes = []    # class_sizes[i] -> total tokens in class i
    total_counts = {}   # word -> occurrences over all classes
    for i in range(n_classes):
        counts = {}
        size = 0
        for doc in tok[i]:
            tokens = tok[i][doc]
            size += len(tokens)
            # Single counting pass replaces repeated list.count() scans.
            for word in tokens:
                counts[word] = counts.get(word, 0) + 1
                total_counts[word] = total_counts.get(word, 0) + 1
        class_counts.append(counts)
        class_sizes.append(size)

    vocab_size = len(total_counts)
    p_optimal = {}
    for word in total_counts:
        scores = {}
        for i in range(n_classes):
            seen = class_counts[i].get(word, 0)
            scores[i] = math.log(float(seen + 1) / float(class_sizes[i] + vocab_size))

        top = max(scores.values())
        half_total = total_counts[word] * 0.5
        for i in range(n_classes):
            # Keep the word when a top-scoring class holds >= 50% of its
            # occurrences.
            if scores[i] == top and class_counts[i].get(word, 0) >= half_total:
                p_optimal[word] = scores
                break

    return p_optimal