Пример #1
0
def makeVectors():
    """Build one weighted feature vector per document and write them to ``vectors.txt``.

    Weights are read from ``./weightVector.txt`` (comma-separated floats) and
    feature words from ``./eigenWord.txt`` (one per CRLF-terminated line).
    For every key returned by ``deconstruct.de()`` a 21-slot vector is built:
    slot ``i`` (0-19) receives ``weight[i]`` when feature word ``i`` occurs
    among the document's ``|``-separated tokens, and slot 20 holds
    ``float()`` of the value that ``deconstruct.de()`` associated with the key.

    Each document is emitted as ``(key,[v0, v1, ...])``, one per line.

    Raises:
        IndexError: if a matched feature word has no corresponding weight.
        ValueError: if a weight or a document value cannot be parsed as float.
    """
    # Slots 0..19 carry features; slot 20 is reserved for the document value.
    feature_slots = 20

    # NOTE(review): presumably maps document path -> label/score; confirm
    # against deconstruct.de().
    vectorsList = deconstruct.de()

    with open('./weightVector.txt', 'rb') as fw:
        weight = [float(x) for x in fw.read().decode('utf8', 'ignore').split(',')]
    with open('./eigenWord.txt', 'rb') as fw:
        words = fw.read().decode('utf8', 'ignore').split('\r\n')

    # Only the first `feature_slots` words may write into the vector. A file
    # ending in CRLF yields an extra empty entry which must never overwrite
    # the value stored in the last slot.
    feature_words = words[:feature_slots]

    result = []
    for key in vectorsList:
        vector = [0] * (feature_slots + 1)
        vector[feature_slots] = float(vectorsList[key])

        if '人民日报' in key:
            # These documents are stored as raw pre-segmented text files.
            with open(key, 'rb') as fw:
                text = fw.read().decode('utf8', 'ignore')
        else:
            text = textDeconstruct.deconstruct(key)['text']

        tokens = set(re.split('[|]', text))
        for i, word in enumerate(feature_words):
            if word in tokens:
                vector[i] = weight[i]

        # Replacing the value of an existing key is safe while iterating.
        vectorsList[key] = vector
        result.append('({0},{1})'.format(key, str(vector)))

    with open('vectors.txt', 'w', 1, 'utf8') as fw:
        fw.write('\n'.join(result))
Пример #2
0
def aimD(aimWord, class_):
    """Count the documents whose token list contains ``aimWord``.

    Args:
        aimWord: Token to look for; it must equal a whole ``|``-separated
            token of the document, not merely a substring.
        class_: ``0`` to consider every document, otherwise only documents
            whose value in ``deconstruct.de()`` equals ``class_``.

    Returns:
        The number of matching documents (each document counts at most once).
    """
    num = 0
    vectorsList = deconstruct.de()
    for key, label in vectorsList.items():
        if class_ != 0 and class_ != label:
            continue

        if '人民日报' in key:
            # Raw pre-segmented file; undecodable bytes are dropped, which
            # yields the same text as a strict decode whenever that succeeds.
            with open(key, 'rb') as fw:
                text = fw.read().decode('utf8', 'ignore')
        else:
            text = textDeconstruct.deconstruct(key)['text']

        if aimWord in re.split('[|]', text):
            num += 1
    return num
Пример #3
0
def wordList(path):
    """Collect the distinct candidate words of all documents and write them to ``path``.

    A token is kept when it is longer than one character, is not a stopword
    (``./stopwords_master/final_stopwords.txt``, one per line), is not purely
    digits, and contains neither ``%`` nor ``.``. Words are written one per
    line, in order of first appearance.

    Args:
        path: Destination file; it is overwritten and encoded as UTF-8.
    """
    vectorsList = deconstruct.de()

    with open('./stopwords_master/final_stopwords.txt', encoding='UTF-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}

    ordered_words = []   # preserves first-seen order for the output file
    seen = set()         # constant-time duplicate check

    for key in vectorsList:
        if '人民日报训练集' in key:
            # Raw pre-segmented file; undecodable bytes are dropped.
            with open(key, 'rb') as fw:
                text = fw.read().decode('utf8', 'ignore')
        else:
            text = textDeconstruct.deconstruct(key)['text']

        for word in re.split('[|]', text):
            if len(word) <= 1 or word in seen:
                continue
            if word in stopwords or word.isdigit() or '%' in word or '.' in word:
                continue
            seen.add(word)
            ordered_words.append(word)

    with open(path, 'w', 1, 'utf-8') as fw:
        fw.write('\n'.join(ordered_words))
Пример #4
0
def calculate():
    """Score every document under ``.\\ResourceSorted`` and log the results.

    The directory tree is walked three levels deep. Each leaf entry is passed
    to ``textDeconstruct.deconstruct`` and its ``'text'`` is split on ``|``.
    Every token from the second one onwards that appears in the sentiment
    dictionary contributes its value, negated when the preceding token is in
    the negation list, or multiplied by the preceding token's degree value
    when that token is in the degree dictionary. One ``(path,score)`` line
    per document is appended to ``resourceAnalysed.txt`` via ``savefile``.
    """
    # Build the dictionaries.
    sentiment = getSenDic()
    negations = getNotList()
    degrees = getDegreeDic()

    cleanfile("resourceAnalysed.txt")
    rootdir = '.\\ResourceSorted'
    top_level = [os.path.join(rootdir, name) for name in os.listdir(rootdir)]

    for top_dir in top_level:
        for sub_name in os.listdir(top_dir):
            corpus_path = os.path.join(top_dir, sub_name)
            for mydir in os.listdir(corpus_path):
                class_path = corpus_path + "\\" + mydir
                tokens = textDeconstruct.deconstruct(class_path)['text'].split('|')

                # Compute the sentiment weight. Pairs are (previous, current),
                # so the first token is only ever used as a modifier.
                score = 0
                for previous, current in zip(tokens, tokens[1:]):
                    if current not in sentiment:
                        continue
                    base = sentiment[current]
                    if previous in negations:
                        score += (base * -1)
                    elif previous in degrees:
                        score += (base * (degrees[previous]))
                    else:
                        score += base

                savefile("resourceAnalysed.txt",
                         '(' + class_path + ',' + str(score) + ')' + '\n')
Пример #5
0
def DFcou():
    """Compute document frequencies for every token under ``./分词对照集/``.

    Each sub-directory of the corpus root is scanned. Files inside
    ``人民日报训练集`` are read as raw UTF-8 text (undecodable bytes dropped);
    all other files go through ``textDeconstruct.deconstruct`` and use its
    ``'text'`` field. The text is split on ``|`` and every distinct token
    adds one to that token's count for the file.

    Returns:
        A list of ``(token, document_count)`` tuples sorted by count,
        highest first. Ties keep the order in which tokens were first seen.
    """
    path = './分词对照集/'
    counts = {}
    for item in os.listdir(path):
        childPath = path + item + '/'
        for file in os.listdir(childPath):
            if item == '人民日报训练集':
                with open(childPath + file, 'rb') as fw:
                    text = fw.read().decode('utf8', 'ignore')
            else:
                text = textDeconstruct.deconstruct(childPath + file)['text']

            # Count each token once per file, in first-occurrence order so
            # the insertion order of `counts` (and thus tie order) is stable.
            seen = set()
            for word in re.split('[|]', text):
                if word in seen:
                    continue
                seen.add(word)
                counts[word] = counts.get(word, 0) + 1

    return sorted(counts.items(), key=lambda item: item[1], reverse=True)
Пример #6
0
def totalT(class_):
    """Return the total number of ``|``-separated tokens over the selected documents.

    Args:
        class_: ``0`` to include every document, otherwise only documents
            whose value in ``deconstruct.de()`` equals ``class_``.

    Returns:
        The sum of the token counts of all selected documents. Empty tokens
        produced by the split are counted as well.
    """
    num = 0
    vectorsList = deconstruct.de()
    for key, label in vectorsList.items():
        if class_ != 0 and class_ != label:
            continue

        if '人民日报' in key:
            # Raw pre-segmented file; undecodable bytes are dropped, which
            # yields the same text as a strict decode whenever that succeeds.
            with open(key, 'rb') as fw:
                text = fw.read().decode('utf8', 'ignore')
        else:
            text = textDeconstruct.deconstruct(key)['text']

        num += len(re.split('[|]', text))
    return num
            os.makedirs(monthSave)
        for file in fileList:
            vector = [
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            ]
            filePath = monthPath + file
            fileSave = monthSave + file
            newText = {}
            text = ''
            if '人民日报' in media:
                fw = open(filePath, 'rb')
                text = fw.read()
                fw.close()
                text = text.decode('utf8', 'ignore')
            else:
                newText = textDeconstruct.deconstruct(filePath)
                text = newText['text']

            words = re.split('[|]', text)
            for i in range(len(eigenWord)):
                if eigenWord[i] in words:
                    vector[i] = weight[i]
            y = clf.predict([vector])
            if y[0] == 1.0:
                fw = open(fileSave, 'w', 1, 'utf8')
                if '人民日报' in media:
                    fw.write(text)
                else:
                    fw.write('<source>' + newText['source'] + '</source>' +
                             '\n<title>' + newText['title'] + '</title>' +
                             '\n<time>' + newText['time'] + '</time>' +