示例#1
0
def IDF(inputFile = '../src/Pos_frequency'):
    """Count segmented tokens from ``inputFile`` and save a log weight per token.

    Every non-blank line is echoed to stdout, passed through ``clean``,
    echoed again and segmented with ``jieba.cut``.  Each token occurrence
    adds one to that token's count, and the space-separated tokens of the
    line are echoed as well.

    The value stored for a token is ``math.log(N / count)``, where ``N`` is
    the number of distinct tokens seen and ``count`` is how many times the
    token occurred.  A token that occurs more than ``N`` times therefore
    gets a negative value.  Results are written as ``"<token> <value>"``
    lines to ``IDF.out`` (UTF-8) and pickled as a dict to ``IDF.pk``, both
    relative to the current working directory.

    :param inputFile: path of the text file to read.
    :return: None.
    """
    import io  # function-scope import: only the UTF-8 output file needs it

    counts = {}
    # The input stays on the builtin ``open`` so ``clean`` keeps receiving the
    # same string type as before; the old ``.encode('utf-8')`` call is gone
    # because it either did nothing or failed on non-ASCII input.
    with open(inputFile, 'r') as infile:
        for raw in infile:
            line = raw.strip()
            if not line:
                # A blank line is not end-of-file: skip it rather than stop.
                continue
            print(line)
            line = clean(line)
            print(line)
            tokens = list(jieba.cut(line))
            for tok in tokens:
                counts[tok] = counts.get(tok, 0) + 1
            # Same debug echo as before: every token followed by one space.
            print(''.join(tok + ' ' for tok in tokens))

    total = len(counts)  # number of distinct tokens
    IDFdict = {}
    with io.open('IDF.out', 'w', encoding='utf-8') as outfile:
        for tok, count in counts.items():
            # float() forces true division; on Python 2 an int/int quotient
            # is floored and a result of 0 makes math.log raise ValueError.
            weight = math.log(float(total) / count)
            IDFdict[tok] = weight
            outfile.write(u'%s %s\n' % (tok, weight))

    # Pickle data is binary, so the file is opened in binary mode.
    with open('IDF.pk', 'wb') as data:
        pickle.dump(IDFdict, data)
示例#2
0
def IDF(inputFile='../src/Pos_frequency'):
    """Count segmented tokens from ``inputFile`` and save a log weight per token.

    Every non-blank line is echoed to stdout, passed through ``clean``,
    echoed again and segmented with ``jieba.cut``.  Each token occurrence
    adds one to that token's count, and the space-separated tokens of the
    line are echoed as well.

    The value stored for a token is ``math.log(N / count)``, where ``N`` is
    the number of distinct tokens seen and ``count`` is how many times the
    token occurred.  A token that occurs more than ``N`` times therefore
    gets a negative value.  Results are written as ``"<token> <value>"``
    lines to ``IDF.out`` (UTF-8) and pickled as a dict to ``IDF.pk``, both
    relative to the current working directory.

    :param inputFile: path of the text file to read.
    :return: None.
    """
    import io  # function-scope import: only the UTF-8 output file needs it

    counts = {}
    # The input stays on the builtin ``open`` so ``clean`` keeps receiving the
    # same string type as before; the old ``.encode('utf-8')`` call is gone
    # because it either did nothing or failed on non-ASCII input.
    with open(inputFile, 'r') as infile:
        for raw in infile:
            line = raw.strip()
            if not line:
                # A blank line is not end-of-file: skip it rather than stop.
                continue
            print(line)
            line = clean(line)
            print(line)
            tokens = list(jieba.cut(line))
            for tok in tokens:
                counts[tok] = counts.get(tok, 0) + 1
            # Same debug echo as before: every token followed by one space.
            print(''.join(tok + ' ' for tok in tokens))

    total = len(counts)  # number of distinct tokens
    IDFdict = {}
    with io.open('IDF.out', 'w', encoding='utf-8') as outfile:
        for tok, count in counts.items():
            # float() forces true division; on Python 2 an int/int quotient
            # is floored and a result of 0 makes math.log raise ValueError.
            weight = math.log(float(total) / count)
            IDFdict[tok] = weight
            outfile.write(u'%s %s\n' % (tok, weight))

    # Pickle data is binary, so the file is opened in binary mode.
    with open('IDF.pk', 'wb') as data:
        pickle.dump(IDFdict, data)
示例#3
0
def text2word(text):
    """Split ``text`` into a label and a body and return the body's tokens.

    ``text`` is split on the literal separator ``'/x01'``.  The part before
    the first separator is ignored; the part after it is segmented with
    ``jieba.cut``.  Tokens that are found in ``symbol`` or that are a single
    space are dropped; the remaining tokens are returned UTF-8 encoded, in
    their original order.

    :param text: string of the form ``<label>/x01<body>``.
    :return: list of UTF-8 encoded tokens.
    :raises ValueError: if ``text`` does not contain the separator.
    """
    # NOTE(review): '/x01' is the four characters "/x01", not the control
    # character '\x01' -- confirm this matches the input data.
    # maxsplit=1 keeps a body that itself contains the separator intact
    # instead of failing the two-way unpack.
    _label, jd = text.split('/x01', 1)
    return [wd.encode('utf8') for wd in jieba.cut(jd)
            if wd not in symbol and wd != ' ']
示例#4
0
def segmentation(text):
    """Return ``text`` segmented by ``jieba.cut`` as one space-delimited string.

    Surrounding whitespace is stripped from the input before segmentation.
    The tokens are joined with a single space between consecutive tokens and
    the joined result is stripped again.  Punctuation is left in place.
    """
    tokens = jieba.cut(text.strip())
    return ' '.join(tokens).strip()