def makeVectors():
    """Build one weighted feature vector per document and save them to ``vectors.txt``.

    Weights are read from ``./weightVector.txt`` (comma-separated floats) and
    feature words from ``./eigenWord.txt`` (separated by ``\\r\\n``). For every
    key of the mapping returned by ``deconstruct.de()`` a 21-slot vector is
    built: slot ``i`` receives ``weight[i]`` when feature word ``i`` occurs
    among the document's ``|``-separated tokens, and slot 20 receives the
    value mapped to the key, converted to ``float``.

    Each vector is also stored back into the mapping under its key, and one
    ``(key,vector)`` line per document is written to ``vectors.txt``.

    Raises:
        IndexError: if there are more feature words than weights, or a
            matching feature word sits at index 21 or beyond.
        ValueError: if a weight or a mapped value cannot be parsed as float.
    """
    vectorsList = deconstruct.de()

    # Files are read as bytes and decoded leniently so that stray invalid
    # bytes are dropped instead of aborting the whole run.
    with open('./weightVector.txt', 'rb') as fh:
        weight = [float(x) for x in fh.read().decode('utf8', 'ignore').split(',')]

    with open('./eigenWord.txt', 'rb') as fh:
        words = fh.read().decode('utf8', 'ignore').split('\r\n')

    result = []
    for key in vectorsList.keys():
        vector = [0] * 21
        vector[20] = float(vectorsList[key])

        # Keys containing this marker are opened directly as token files;
        # every other key goes through textDeconstruct.
        if '人民日报' in key:
            with open(key, 'rb') as fh:
                text = fh.read().decode('utf8', 'ignore')
        else:
            text = textDeconstruct.deconstruct(key)['text']

        # A set makes each feature-word lookup constant time.
        tokens = set(re.split('[|]', text))
        for i, word in enumerate(words):
            if word in tokens:
                vector[i] = weight[i]

        vectorsList[key] = vector
        result.append('({0},{1})'.format(key, str(vector)))

    with open('vectors.txt', 'w', 1, 'utf8') as fh:
        fh.write('\n'.join(result))
def aimD(aimWord, class_):
    """Count the documents whose token list contains ``aimWord``.

    Args:
        aimWord: Token to look for; it must equal a whole ``|``-separated
            token, not merely be a substring of one.
        class_: Only documents whose value in the mapping returned by
            ``deconstruct.de()`` equals ``class_`` are examined. Passing
            ``0`` examines every document.

    Returns:
        The number of examined documents that contain ``aimWord`` at least
        once (each document counts at most once).
    """
    num = 0
    vectorsList = deconstruct.de()
    for key, label in vectorsList.items():
        if class_ != 0 and class_ != label:
            continue

        # Keys containing this marker are opened directly as token files;
        # every other key goes through textDeconstruct.
        if '人民日报' in key:
            with open(key, 'rb') as fh:
                # 'ignore' yields the same text as a strict decode whenever
                # the bytes are valid UTF-8, so no strict attempt is needed.
                text = fh.read().decode('utf8', 'ignore')
        else:
            text = textDeconstruct.deconstruct(key)['text']

        if aimWord in re.split('[|]', text):
            num += 1
    return num
def wordList(path):
    """Collect the distinct candidate words of all documents and write them to ``path``.

    Every key of the mapping returned by ``deconstruct.de()`` is tokenised on
    ``|``. A token is kept when it is longer than one character, is not
    listed in ``./stopwords_master/final_stopwords.txt``, is not purely
    digits, and contains neither ``%`` nor ``.``. Kept tokens are written in
    first-seen order, one per line, UTF-8 encoded.

    Args:
        path: Destination file; it is overwritten.
    """
    vectorsList = deconstruct.de()

    with open('./stopwords_master/final_stopwords.txt', encoding='UTF-8') as fh:
        stopwords = {line.strip() for line in fh}

    vocabulary = []   # keeps first-seen order for the output file
    seen = set()      # constant-time duplicate check alongside the list
    for key in vectorsList.keys():
        # Keys containing this marker are opened directly as token files;
        # every other key goes through textDeconstruct.
        if '人民日报训练集' in key:
            with open(key, 'rb') as fh:
                # 'ignore' yields the same text as a strict decode whenever
                # the bytes are valid UTF-8.
                text = fh.read().decode('utf8', 'ignore')
        else:
            text = textDeconstruct.deconstruct(key)['text']

        for word in re.split('[|]', text):
            if len(word) <= 1 or word in seen:
                continue
            if word in stopwords or word.isdigit() or '%' in word or '.' in word:
                continue
            seen.add(word)
            vocabulary.append(word)

    with open(path, 'w', 1, 'utf-8') as fh:
        fh.write('\n'.join(vocabulary))
def calculate():
    """Score every document under ``ResourceSorted`` and log the scores.

    The directory tree is walked three levels deep
    (``ResourceSorted/<a>/<b>/<file>``). Each file is tokenised via
    ``textDeconstruct.deconstruct(...)['text']`` split on ``|``. For every
    token (from the second one on) found in the dictionary from
    ``getSenDic()``, its value is added to the document score:

    * negated (multiplied by -1) when the preceding token is in the
      collection from ``getNotList()``;
    * multiplied by the preceding token's value from ``getDegreeDic()``
      when that token is a key there;
    * unchanged otherwise.

    ``resourceAnalysed.txt`` is first passed to ``cleanfile`` and then
    receives one ``(path,score)`` line per document through ``savefile``.
    """
    # Build the lookup dictionaries.
    sen_dic = getSenDic()
    not_list = getNotList()
    degree_dic = getDegreeDic()
    cleanfile("resourceAnalysed.txt")

    # os.path.join yields the same backslash-separated paths on Windows as
    # the former hard-coded '\\' separators, and valid paths elsewhere.
    rootdir = os.path.join('.', 'ResourceSorted')
    top_dirs = [os.path.join(rootdir, name) for name in os.listdir(rootdir)]

    for top_dir in top_dirs:
        for sub_name in os.listdir(top_dir):
            corpus_path = os.path.join(top_dir, sub_name)
            for mydir in os.listdir(corpus_path):
                class_path = os.path.join(corpus_path, mydir)
                content = textDeconstruct.deconstruct(class_path)['text'].split('|')

                # Compute the sentiment weight.
                # NOTE(review): pairing each token with its predecessor means
                # the first token is never scored, exactly as before —
                # confirm that this is intended.
                result = 0
                for previous, current in zip(content, content[1:]):
                    if current not in sen_dic:
                        continue
                    score = sen_dic[current]
                    if previous in not_list:
                        result += score * -1
                    elif previous in degree_dic:
                        result += score * degree_dic[previous]
                    else:
                        result += score

                savefile("resourceAnalysed.txt",
                         '(' + class_path + ',' + str(result) + ')' + '\n')
def DFcou():
    """Compute the document frequency of every token in the corpus.

    Walks each sub-directory of ``./分词对照集/`` and tokenises every file in
    it on ``|``. Files in the ``人民日报训练集`` sub-directory are read
    directly (UTF-8, undecodable bytes dropped); files elsewhere are passed
    through ``textDeconstruct.deconstruct`` and its ``'text'`` entry is used.
    A token is counted once per file regardless of how often it occurs there.

    Returns:
        A list of ``(token, document_count)`` tuples sorted by count,
        highest first. Tokens with equal counts keep first-seen order.
    """
    DF = {}
    path = './分词对照集/'
    for item in os.listdir(path):
        childPath = path + item + '/'
        for file in os.listdir(childPath):
            if item == '人民日报训练集':
                with open(childPath + file, 'rb') as fh:
                    # 'ignore' yields the same text as a strict decode
                    # whenever the bytes are valid UTF-8.
                    text = fh.read().decode('utf8', 'ignore')
            else:
                text = textDeconstruct.deconstruct(childPath + file)['text']

            # dict.fromkeys removes duplicates in linear time while keeping
            # first-seen order, so tie ordering in the result is unchanged.
            for word in dict.fromkeys(re.split('[|]', text)):
                DF[word] = DF.get(word, 0) + 1

    return sorted(DF.items(), key=lambda pair: pair[1], reverse=True)
def totalT(class_):
    """Return the total number of tokens across the documents of a class.

    Args:
        class_: Only documents whose value in the mapping returned by
            ``deconstruct.de()`` equals ``class_`` are counted. Passing
            ``0`` counts every document.

    Returns:
        The summed number of ``|``-separated tokens over the selected
        documents. Empty tokens produced by the split are included.
    """
    num = 0
    vectorsList = deconstruct.de()
    for key, label in vectorsList.items():
        if class_ != 0 and class_ != label:
            continue

        # Keys containing this marker are opened directly as token files;
        # every other key goes through textDeconstruct.
        if '人民日报' in key:
            with open(key, 'rb') as fh:
                # 'ignore' yields the same text as a strict decode whenever
                # the bytes are valid UTF-8, so no strict attempt is needed.
                text = fh.read().decode('utf8', 'ignore')
        else:
            text = textDeconstruct.deconstruct(key)['text']

        num += len(re.split('[|]', text))
    return num
os.makedirs(monthSave) for file in fileList: vector = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] filePath = monthPath + file fileSave = monthSave + file newText = {} text = '' if '人民日报' in media: fw = open(filePath, 'rb') text = fw.read() fw.close() text = text.decode('utf8', 'ignore') else: newText = textDeconstruct.deconstruct(filePath) text = newText['text'] words = re.split('[|]', text) for i in range(len(eigenWord)): if eigenWord[i] in words: vector[i] = weight[i] y = clf.predict([vector]) if y[0] == 1.0: fw = open(fileSave, 'w', 1, 'utf8') if '人民日报' in media: fw.write(text) else: fw.write('<source>' + newText['source'] + '</source>' + '\n<title>' + newText['title'] + '</title>' + '\n<time>' + newText['time'] + '</time>' +