def IDF(inputFile='../src/Pos_frequency'):
    """Compute an inverse-document-frequency score for every word in *inputFile*.

    Each non-blank line is cleaned via ``clean``, segmented with ``jieba``,
    and every token occurrence is counted.  The scores are written to
    'IDF.out' (one "word score" pair per line) and the whole dictionary is
    pickled to 'IDF.pk'.

    NOTE(review): counts are per token *occurrence*, not per document, and
    the numerator is the vocabulary size rather than a document count --
    confirm this is the intended IDF variant before relying on the values.

    Fixes vs. the original:
    * removed the duplicated, shadowing second definition of this function;
    * float division (the Python 2 ``Len / count`` floored to an int,
      corrupting every score);
    * a blank line no longer terminates reading early -- it is skipped;
    * files are closed deterministically via ``with``;
    * O(1) counting via ``dict.get`` instead of scanning ``keys()``.
    """
    IDFdict = {}  # token -> occurrence count, later overwritten with its score
    with open(inputFile, 'r') as infile, open('IDF.out', 'w') as outfile:
        for line in infile:
            line = line.strip().encode('utf-8')
            if line == '':
                continue  # skip blank lines instead of stopping at them
            print(line)
            line = clean(line)
            print(line)
            W = ''
            for token in jieba.cut(line):
                W += token + ' '
                # dict.get avoids the O(vocabulary) membership scan of keys()
                IDFdict[token] = IDFdict.get(token, 0) + 1
            print(W)
        Len = len(IDFdict)  # total number of distinct words
        for word in IDFdict:
            # float() guards against integer (floor) division under Python 2
            IDFdict[word] = math.log(float(Len) / IDFdict[word])
            outfile.write('%s %s\n' % (word, IDFdict[word]))
    # serialize the score dictionary for later reuse
    with open('IDF.pk', 'w') as data:
        pickle.dump(IDFdict, data)
def text2word(text):
    """Segment the text part of a "label/x01body" record into UTF-8 words.

    The record is split on the literal four-character separator '/x01';
    tokens found in ``symbol`` and single-space tokens are dropped.

    NOTE(review): if the data actually uses the control character '\\x01'
    as its separator, splitting on the literal string '/x01' is a latent
    bug -- confirm against the producer of these records.
    """
    _, body = text.split('/x01')
    return [wd.encode('utf8')
            for wd in jieba.cut(body)
            if wd not in symbol and wd != ' ']
def segmentation(text):
    """Segment *text* with jieba and return the tokens joined by spaces.

    Leading/trailing whitespace is stripped from both the input and the
    result, matching the original behavior.  Replaces the quadratic
    ``+=`` string accumulation with a single ``' '.join`` (identical
    output: interior separators are unchanged and the old trailing space
    was stripped anyway) and drops the commented-out punctuation code.
    """
    return ' '.join(jieba.cut(text.strip())).strip()