def parse(stat, path='mirror/', n_news=10000):
    """Accumulate term statistics from numbered news files into ``stat``.

    The files ``<path>1.txt`` .. ``<path><n_news>.txt`` are read in order.
    Three lines are taken from each file: title, body and category.  The
    list of stop words and every ``(number, title)`` pair are printed as
    progress output.  Files whose category is not a key of ``stat.cats``
    contribute nothing.

    Body text is split on runs of non-letters and lower-cased; stop words
    and tokens shorter than two characters are ignored.

    Args:
        stat: Statistics holder, updated in place.  This function touches
            ``cats``, ``terms``, ``termToInt``, ``termInDoc``,
            ``termAmount``, ``termInCat``, ``catTermAmount`` and
            ``totalTerm``.
        path: Prefix prepended to every file name.
        n_news: Number of the last file to read (numbering starts at 1).

    Returns:
        None.
    """
    # NOTE(review): this file defines parse() a second time further down;
    # the later definition is the one bound at import time -- confirm which
    # copy should be kept.
    stopWord = StopWord.getStopWord()
    print(str(stopWord))

    # lastDoc[i] is the number of the last file in which term i was seen.
    # It is sized to the terms already registered so that a pre-populated
    # ``stat`` does not cause an IndexError.
    lastDoc = [-1] * len(stat.terms)

    for number in range(1, n_news + 1):
        filename = path + str(number) + '.txt'
        with open(filename, 'rb') as fin:
            # Binary mode yields bytes on Python 3; decode so the str regex
            # and the str keys of stat.cats can be used.
            title = fin.readline().decode('utf-8', 'replace')
            body = fin.readline().decode('utf-8', 'replace')
            # Strip the line break, otherwise the category only matches
            # when the file happens to end without a newline.
            category = fin.readline().decode('utf-8', 'replace').strip()
        print(number, title.strip())

        if category not in stat.cats:
            continue
        catNo = stat.cats[category]

        for item in re.split('[^a-zA-Z]+', body):
            item = item.lower()
            # ``<= 1`` also drops the empty strings re.split produces when
            # the body starts or ends with a non-letter.
            if len(item) <= 1 or item in stopWord:
                continue
            stat.catTermAmount[catNo] += 1
            # termToInt mirrors stat.terms, so the mapping lookup replaces
            # a linear scan of the list.
            if item not in stat.termToInt:
                stat.termToInt[item] = len(stat.terms)
                stat.terms.append(item)
                stat.termInDoc.append(0)
                stat.termAmount.append(0)
                lastDoc.append(-1)
            # NOTE(review): counted once per accepted token (not once per
            # new term) -- the collapsed source is ambiguous; confirm.
            stat.totalTerm += 1
            no = stat.termToInt[item]
            if lastDoc[no] != number:
                # First occurrence of this term in the current file.
                lastDoc[no] = number
                stat.termInDoc[no] += 1
            stat.termAmount[no] += 1
            stat.termInCat[catNo][no] += 1
def parse(stat, path='mirror/', n_news=10000):
    """Accumulate term statistics from numbered news files into ``stat``.

    The files ``<path>1.txt`` .. ``<path><n_news>.txt`` are read in order.
    Three lines are taken from each file: title, body and category.  The
    list of stop words and every ``(number, title)`` pair are printed as
    progress output.  Files whose category is not a key of ``stat.cats``
    contribute nothing.

    Body text is split on runs of non-letters and lower-cased; stop words
    and tokens shorter than two characters are ignored.

    Args:
        stat: Statistics holder, updated in place.  This function touches
            ``cats``, ``terms``, ``termToInt``, ``termInDoc``,
            ``termAmount``, ``termInCat``, ``catTermAmount`` and
            ``totalTerm``.
        path: Prefix prepended to every file name.
        n_news: Number of the last file to read (numbering starts at 1).

    Returns:
        None.
    """
    stopWord = StopWord.getStopWord()
    print(str(stopWord))

    # lastDoc[i] is the number of the last file in which term i was seen.
    # It is sized to the terms already registered so that a pre-populated
    # ``stat`` does not cause an IndexError.
    lastDoc = [-1] * len(stat.terms)

    for number in range(1, n_news + 1):
        filename = path + str(number) + '.txt'
        with open(filename, 'rb') as fin:
            # Binary mode yields bytes on Python 3; decode so the str regex
            # and the str keys of stat.cats can be used.
            title = fin.readline().decode('utf-8', 'replace')
            body = fin.readline().decode('utf-8', 'replace')
            # Strip the line break, otherwise the category only matches
            # when the file happens to end without a newline.
            category = fin.readline().decode('utf-8', 'replace').strip()
        print(number, title.strip())

        if category not in stat.cats:
            continue
        catNo = stat.cats[category]

        for item in re.split('[^a-zA-Z]+', body):
            item = item.lower()
            # ``<= 1`` also drops the empty strings re.split produces when
            # the body starts or ends with a non-letter.
            if len(item) <= 1 or item in stopWord:
                continue
            stat.catTermAmount[catNo] += 1
            # termToInt mirrors stat.terms, so the mapping lookup replaces
            # a linear scan of the list.
            if item not in stat.termToInt:
                stat.termToInt[item] = len(stat.terms)
                stat.terms.append(item)
                stat.termInDoc.append(0)
                stat.termAmount.append(0)
                lastDoc.append(-1)
            # NOTE(review): counted once per accepted token (not once per
            # new term) -- the collapsed source is ambiguous; confirm.
            stat.totalTerm += 1
            no = stat.termToInt[item]
            if lastDoc[no] != number:
                # First occurrence of this term in the current file.
                lastDoc[no] = number
                stat.termInDoc[no] += 1
            stat.termAmount[no] += 1
            stat.termInCat[catNo][no] += 1
def test(stat, path='', n_test=10):
    """Classify numbered test files with ``stat`` and print a report.

    The files ``<path>1.txt`` .. ``<path><n_test>.txt`` are read in order;
    each supplies a title, a body and the expected category on its first
    three lines.  For every category in ``stat.cats`` a log score is
    computed from the body tokens known to ``stat`` (count + 1 over
    category total + vocabulary size) plus a log term for the category
    itself; the highest-scoring category is the prediction.  Files whose
    expected category is not in ``stat.cats`` are classified but not
    counted.

    One line is printed per counted file, followed by the overall
    correct/total figure and per-category precision, recall and F value
    (percentages; ``-1`` marks an undefined precision or recall).

    Args:
        stat: Trained statistics holder; ``cats``, ``terms``,
            ``termToInt``, ``termInCat``, ``catTermAmount`` and
            ``totalTerm`` are read.
        path: Prefix prepended to every file name.
        n_test: Number of the last file to read (numbering starts at 1).

    Returns:
        None.
    """
    # NOTE(review): this file defines test() a second time further down;
    # the later definition is the one bound at import time -- confirm which
    # copy should be kept.
    allCat = {
        'Crime and law': 0,
        'Culture and entertainment': 0,
        'Disasters and accidents': 0,
        'Science and technology': 0,
        'Health': 0,
    }
    # Make sure every category that can be predicted or expected has a
    # counter; otherwise an extra key in stat.cats raises KeyError below.
    for known in stat.cats:
        allCat.setdefault(known, 0)
    callBack = dict(allCat)   # correctly predicted, per category
    callAll = dict(allCat)    # predicted, per category
    stopWord = StopWord.getStopWord()
    termSum = len(stat.terms)
    correct = 0
    wrong = 0

    for n in range(1, n_test + 1):
        filename = path + str(n) + '.txt'
        with open(filename, 'rb') as fin:
            # Binary mode yields bytes on Python 3; decode so the values
            # can be split with a str regex and concatenated with str.
            title = fin.readline().decode('utf-8', 'replace').strip()
            body = fin.readline().decode('utf-8', 'replace')
            cat = fin.readline().decode('utf-8', 'replace').strip()

        # Tokenise once per file instead of once per category; only terms
        # present in the model contribute to the score.
        termNos = []
        for t in re.split('[^a-zA-Z]+', body):
            t = t.lower()
            # ``<= 1`` also drops the empty strings produced by re.split.
            if len(t) <= 1 or t in stopWord:
                continue
            if t in stat.termToInt:
                termNos.append(stat.termToInt[t])

        maxi = 0
        toCat = ''
        for candidate in stat.cats:
            noC = stat.cats[candidate]
            denominator = stat.catTermAmount[noC] + termSum
            p = 0.0
            for noT in termNos:
                p += math.log(1.0 * (stat.termInCat[noC][noT] + 1) / denominator)
            p += math.log(1.0 * (stat.catTermAmount[noC] + 0.01) / stat.totalTerm)
            # The first candidate always wins: scores are logs and may all
            # lie below the initial value of maxi.
            if toCat == '' or p > maxi:
                maxi = p
                toCat = candidate

        if cat in stat.cats:
            allCat[cat] += 1
            callAll[toCat] += 1
            if toCat == cat:
                callBack[cat] += 1
                correct += 1
                print(title + ' : ' + cat + ' toCat: ' + toCat + ' Yes')
            else:
                wrong += 1
                print(title + ' : ' + cat + ' toCat: ' + toCat + ' No')

    print('\nTotal Precision: correct / total = %d / %d' % (correct, correct + wrong))
    for cat in allCat:
        print('[' + cat + ']')
        if callAll[cat] > 0:
            p = callBack[cat] * 100.0 / callAll[cat]
        else:
            p = -1
        if allCat[cat] > 0:
            r = callBack[cat] * 100.0 / allCat[cat]
        else:
            r = -1
        print('Precision : %d / %d = %.3f%%' % (callBack[cat], callAll[cat], p))
        print('Recall : %d / %d = %.3f%%' % (callBack[cat], allCat[cat], r))
        # Precision and recall are both 0 when nothing was classified
        # correctly; report 0 rather than dividing by zero.
        if p + r != 0:
            f = 2.0 * p * r / (p + r)
        else:
            f = 0.0
        print('F = %.3f%%' % f)
def test(stat, path='', n_test=10):
    """Classify numbered test files with ``stat`` and print a report.

    The files ``<path>1.txt`` .. ``<path><n_test>.txt`` are read in order;
    each supplies a title, a body and the expected category on its first
    three lines.  For every category in ``stat.cats`` a log score is
    computed from the body tokens known to ``stat`` (count + 1 over
    category total + vocabulary size) plus a log term for the category
    itself; the highest-scoring category is the prediction.  Files whose
    expected category is not in ``stat.cats`` are classified but not
    counted.

    One line is printed per counted file, followed by the overall
    correct/total figure and per-category precision, recall and F value
    (percentages; ``-1`` marks an undefined precision or recall).

    Args:
        stat: Trained statistics holder; ``cats``, ``terms``,
            ``termToInt``, ``termInCat``, ``catTermAmount`` and
            ``totalTerm`` are read.
        path: Prefix prepended to every file name.
        n_test: Number of the last file to read (numbering starts at 1).

    Returns:
        None.
    """
    allCat = {
        'Crime and law': 0,
        'Culture and entertainment': 0,
        'Disasters and accidents': 0,
        'Science and technology': 0,
        'Health': 0,
    }
    # Make sure every category that can be predicted or expected has a
    # counter; otherwise an extra key in stat.cats raises KeyError below.
    for known in stat.cats:
        allCat.setdefault(known, 0)
    callBack = dict(allCat)   # correctly predicted, per category
    callAll = dict(allCat)    # predicted, per category
    stopWord = StopWord.getStopWord()
    termSum = len(stat.terms)
    correct = 0
    wrong = 0

    for n in range(1, n_test + 1):
        filename = path + str(n) + '.txt'
        with open(filename, 'rb') as fin:
            # Binary mode yields bytes on Python 3; decode so the values
            # can be split with a str regex and concatenated with str.
            title = fin.readline().decode('utf-8', 'replace').strip()
            body = fin.readline().decode('utf-8', 'replace')
            cat = fin.readline().decode('utf-8', 'replace').strip()

        # Tokenise once per file instead of once per category; only terms
        # present in the model contribute to the score.
        termNos = []
        for t in re.split('[^a-zA-Z]+', body):
            t = t.lower()
            # ``<= 1`` also drops the empty strings produced by re.split.
            if len(t) <= 1 or t in stopWord:
                continue
            if t in stat.termToInt:
                termNos.append(stat.termToInt[t])

        maxi = 0
        toCat = ''
        for candidate in stat.cats:
            noC = stat.cats[candidate]
            denominator = stat.catTermAmount[noC] + termSum
            p = 0.0
            for noT in termNos:
                p += math.log(1.0 * (stat.termInCat[noC][noT] + 1) / denominator)
            p += math.log(1.0 * (stat.catTermAmount[noC] + 0.01) / stat.totalTerm)
            # The first candidate always wins: scores are logs and may all
            # lie below the initial value of maxi.
            if toCat == '' or p > maxi:
                maxi = p
                toCat = candidate

        if cat in stat.cats:
            allCat[cat] += 1
            callAll[toCat] += 1
            if toCat == cat:
                callBack[cat] += 1
                correct += 1
                print(title + ' : ' + cat + ' toCat: ' + toCat + ' Yes')
            else:
                wrong += 1
                print(title + ' : ' + cat + ' toCat: ' + toCat + ' No')

    print('\nTotal Precision: correct / total = %d / %d' % (correct, correct + wrong))
    for cat in allCat:
        print('[' + cat + ']')
        if callAll[cat] > 0:
            p = callBack[cat] * 100.0 / callAll[cat]
        else:
            p = -1
        if allCat[cat] > 0:
            r = callBack[cat] * 100.0 / allCat[cat]
        else:
            r = -1
        print('Precision : %d / %d = %.3f%%' % (callBack[cat], callAll[cat], p))
        print('Recall : %d / %d = %.3f%%' % (callBack[cat], allCat[cat], r))
        # Precision and recall are both 0 when nothing was classified
        # correctly; report 0 rather than dividing by zero.
        if p + r != 0:
            f = 2.0 * p * r / (p + r)
        else:
            f = 0.0
        print('F = %.3f%%' % f)