def on_Vectorize_clicked(self):
    """Tokenize the loaded sentences, train a Word2Vec model, and save the
    per-sentence vectors under ``outfile/<fileInitName>``.

    On success enables the K-means group box and sets ``checkVectorize``;
    logs a message to ``labelLog`` if no sentences have been loaded yet.
    """
    vectorSize = int(self.spinBoxVecSize.text())
    if not self.listRawSents:
        self.labelLog.setText('Chưa có câu.')
    else:
        linkFolder = 'outfile/{0}'.format(self.fileInitName)
        make_folder.create_folder(linkFolder)
        linkModel = linkFolder + '/word2vec.model'
        # Tokenize each sentence, keeping only word-like tokens
        # (leading word character) so punctuation is dropped.
        for sent in self.listRawSents:
            tokens = word_tokenize(sent, format='text').split()
            words = [token for token in tokens if re.match(r'^\w+', token)]
            self.listSentToWord.append(words)
        # Train word2vec on the tokenized sentences.
        # NOTE(review): `size=` is the gensim 3.x keyword; gensim 4+
        # renamed it to `vector_size` — confirm the pinned version.
        model = Word2Vec(self.listSentToWord, size=vectorSize, min_count=1)
        model.save(linkModel)
        sent2vec = Sentence2Vec(linkModel)
        listVect = [
            sent2vec.get_vector(sent).tolist() for sent in self.listRawSents
        ]
        write_file.list_to_txt(listVect, linkFolder, 'Sent2Vect.txt')
        write_file.list_to_txt(self.listSentToWord, linkFolder,
                               'WordTokenize.txt')
        self.labelLog.setText('Đã lưu file vector.')
        self.checkVectorize = 1
        self.groupBoxKmeans.setEnabled(True)
def get_SR(inputDict: dict, FISDir, FISName, minconf: float, minsupp: float):
    """Read a frequent item set file and write its strong rules to disk.

    Writes the full rule list and the top-ten rules into RESULT
    subfolders named after minsupp/minconf; prints a message instead of
    writing when a list is empty.

    :param inputDict: transaction data passed through to Strong_rules.
    :param FISDir: directory containing the frequent-item-set file.
    :param FISName: file name of the frequent item set.
    :param minconf: minimum confidence threshold.
    :param minsupp: minimum support (used only in the output folder name).
    """
    frequentItemSet = read_file.read_lines_to_list(FISDir, FISName, ', ')
    (ruleList, topTenRules) = Strong_rules.get_all_strong_rule_2(
        inputDict, frequentItemSet, minconf)
    # Plain if/else instead of conditional expressions used as
    # statements: these calls are executed for their side effects.
    if ruleList:
        write_file.list_to_txt(
            ruleList,
            './RESULT/Strong_rule_minsupp_' + str(minsupp)
            + ' minconf_' + str(minconf) + '/',
            'Rule_' + FISName)
    else:
        print('Không có luật thỏa mãn.')
    if topTenRules:
        write_file.list_to_txt(
            topTenRules,
            './RESULT/Topten_Strong_rule_minsupp_' + str(minsupp)
            + ' minconf_' + str(minconf) + '/',
            '10_Rules_' + FISName)
    else:
        print('Không có luật thỏa mãn.')
def write_outfile(self):
    """Persist every intermediate TF-IDF artifact into the 'TFIDF' folder."""
    # Table-driven: (writer, payload, file name), emitted in order.
    outputs = (
        (write_file.list_to_txt, self.listSentToWord, 'Word_Tokenize.txt'),
        (write_file.dict_to_txt, self.dictWordAtSent, 'Index_of_word.txt'),
        (write_file.list_to_txt, self.listTF, 'TF.txt'),
        (write_file.dict_to_txt, self.dictIDF, 'IDF.txt'),
        (write_file.list_to_txt, self.listTFIDF, 'TFIDFs.txt'),
    )
    for writer, payload, outName in outputs:
        writer(payload, 'TFIDF', outName)
def main():
    """Build a stop-word list for the corpus using scikit-learn's TF-IDF.

    A term is treated as a stop word when its IDF is within 1.0 of the
    corpus maximum IDF minus one, i.e. it occurs in nearly every
    sentence. Writes the result to TFIDF/stopword_withSKlearn.txt.
    """
    fileName = 'corpus_che.txt'
    listRawSentence = read_file.read_line_to_sentenceList(
        './datasets/', fileName, '\n')
    # Pre-segment Vietnamese words so the vectorizer tokenizes on spaces.
    listSentToWord = [
        word_tokenize(sent, format='text') for sent in listRawSentence
    ]
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(lowercase=False, min_df=100)
    tfidf.fit_transform(listSentToWord)  # matrix itself is unused; we only need idf_
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # switch to get_feature_names_out() if the pinned version is newer.
    features = tfidf.get_feature_names()
    maxIdf = max(tfidf.idf_)  # hoisted: invariant across the filter below
    print(min(tfidf.idf_), maxIdf, len(features))
    # Low IDF => the word appears in almost every sentence => stop word.
    listStopWords = [
        feature for index, feature in enumerate(features)
        if tfidf.idf_[index] <= maxIdf - 1
    ]
    write_file.list_to_txt(listStopWords, './TFIDF/',
                           'stopword_withSKlearn.txt')
def main():
    """Split giao_thong.txt into three per-cluster sentence files.

    Each ``.clus`` file holds one line of sentence indices; a sentence is
    written to the first cluster (0, 1, 2) whose index set contains it.
    """
    fileName = 'giao_thong.txt'
    clusFileNames = ('giaothongclus0.clus',
                     'giaothongclus1.clus',
                     'giaothongclus2.clus')
    # Index lists converted to sets: membership is tested once per
    # sentence per cluster, so O(1) lookups instead of O(n) list scans.
    clusters = [
        set(read_file.read_lines_to_intlist_nonSquareBracklets(
            './SomeExample', clusName, ' ')[0])
        for clusName in clusFileNames
    ]
    listRawText = read_file.read_line_to_sentenceList(
        './SomeExample/', fileName, '\n')
    buckets = ([], [], [])
    for iSent, sent in enumerate(listRawText):
        # Mirrors the original if/elif chain: first matching cluster wins.
        for clusIndices, bucket in zip(clusters, buckets):
            if iSent in clusIndices:
                bucket.append(sent)
                break
    for clusNum, bucket in enumerate(buckets):
        write_file.list_to_txt(
            bucket, 'SomeExample', 'giao_thong_clus{0}.txt'.format(clusNum))
# Merge the per-cluster sentence files of one SomeExample subfolder into a
# single corpus file plus a parallel label file
# (0 = .clus0 / negative, 1 = .clus1 / positive, 2 = .clus2 / neutral).
linkFolder = './SomeExample/2/'
fileMergeName = 'giao_thong'
listDocName = os.listdir(linkFolder)
negSent = []
posSent = []
neuSent = []
for fileName in listDocName:
    print(fileName[-6:])
    if fileName.endswith('.clus0'):
        negSent = read_file.read_line_to_sentenceList(linkFolder, fileName, '\n')
    elif fileName.endswith('.clus1'):
        posSent = read_file.read_line_to_sentenceList(linkFolder, fileName, '\n')
    elif fileName.endswith('.clus2'):
        neuSent = read_file.read_line_to_sentenceList(linkFolder, fileName, '\n')
listAllDoc = negSent + posSent + neuSent
# Build labels AFTER the loop, in the same neg/pos/neu order as the merged
# sentences. (Appending labels inside the loop followed os.listdir order,
# which is arbitrary and could misalign labels with sentences.)
listLabelStr = ('0, ' * len(negSent)
                + '1, ' * len(posSent)
                + '2, ' * len(neuSent))
# NOTE(review): linkDatasets is not defined in this chunk — presumably a
# module-level constant defined elsewhere; confirm before running standalone.
write_file.list_to_txt(listAllDoc, linkDatasets, fileMergeName + '.txt')
with open(linkDatasets + fileMergeName + '.lb', 'w', encoding='utf-8') as f:
    f.write(listLabelStr)