import math       # used by idf() further down
import token__    # project-local tokenizer; both may already be imported at the top of the file


def tfidf_optimal(news):
    # idf_optimal() is assumed to be defined elsewhere in this module
    # (only idf() appears below); it should return {word: idf score}.
    c = idf_optimal(news)
    wordlist = list(c.keys())          # fixed word order for the vector columns
    tok = token__.tokenize(news)       # {document label: list of tokens}
    label = tok.keys()

    # One zero-filled row per document, one column per vocabulary word.
    matrix = {}
    for t in label:
        matrix[t] = [0] * len(wordlist)

    # Fill each cell with raw term count * idf weight.
    for t in label:
        for i in tok[t]:
            if i in wordlist:
                matrix[t][wordlist.index(i)] = tok[t].count(i) * c[i]

    # Normalize each document vector in place.
    for i in matrix:
        normalize(matrix[i])

    with open('./tf.txt', 'w') as f:
        for i in matrix:
            f.write(str(matrix[i]) + '\n')
    return matrix
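# normalize() is called above but not defined in this file.  A minimal in-place
# sketch is given below, assuming the intent is L2 (unit-length) normalization;
# any other scaling rule could be swapped in without touching tfidf_optimal().
def normalize(vec):
    # Divide every component by the Euclidean norm; leave all-zero rows untouched.
    norm = math.sqrt(sum(x * x for x in vec))
    if norm > 0:
        for idx in range(len(vec)):
            vec[idx] /= norm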
def idf(news):
    tok = token__.tokenize(news)

    # Concatenate each document's *unique* tokens, so that text.count(word)
    # equals the number of documents containing the word.
    text = []
    for i in tok:
        text += list(set(tok[i]))

    # Base-2 log of len(text) / document frequency.  Note that len(text) is the
    # total number of (document, unique word) pairs, not the document count.
    idf_scores = {}
    for word in text:
        idf_scores[word] = math.log(float(len(text)) / float(text.count(word))) / math.log(2)

    with open('./idf.txt', 'w') as f:
        f.write(str(idf_scores))
    return idf_scores
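# Minimal usage sketch.  The only assumption the functions above make about `news`
# is that token__.tokenize(news) returns {document label: list of tokens}; the
# input file name and loading step here are illustrative only.
if __name__ == '__main__':
    with open('./corpus.txt') as f:    # hypothetical input file
        news = f.read()
    weights = tfidf_optimal(news)      # also writes ./tf.txt as a side effect
    print('%d documents vectorized' % len(weights))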