Example #1
    def on_Vectorize_clicked(self):
        vectorSize = int(self.spinBoxVecSize.text())
        if len(self.listRawSents) == 0:
            self.labelLog.setText('No sentences yet.')
        else:
            linkFolder = 'outfile/{0}'.format(self.fileInitName)
            make_folder.create_folder(linkFolder)
            linkModel = linkFolder + '/word2vec.model'
            # tokenize each sentence into a list of words
            for sent in self.listRawSents:
                tokens = word_tokenize(sent, format='text').split()
                words = []
                for token in tokens:
                    # keep only word-like tokens, dropping punctuation
                    if re.match(r'^\w+', token):
                        words.append(token)
                self.listSentToWord.append(words)
            # train Word2Vec on the tokenized sentence list
            # (gensim < 4.0 takes `size`; gensim >= 4.0 renamed it to `vector_size`)
            model = Word2Vec(self.listSentToWord, size=vectorSize, min_count=1)
            model.save(linkModel)
            sent2vec = Sentence2Vec(linkModel)
            listVect = []
            for sent in self.listRawSents:
                listVect.append(sent2vec.get_vector(sent).tolist())
            write_file.list_to_txt(listVect, linkFolder, 'Sent2Vect.txt')
            write_file.list_to_txt(self.listSentToWord, linkFolder, 'WordTokenize.txt')

            self.labelLog.setText('Vector files saved.')
            self.checkVectorize = 1
            self.groupBoxKmeans.setEnabled(True)
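
For reference, Sentence2Vec is not defined in these snippets. Below is a minimal sketch of what its get_vector plausibly does (averaging the Word2Vec vectors of a sentence's words); the tokenization step and the zero-vector fallback are assumptions, not the actual third-party implementation:

import numpy as np
from gensim.models import Word2Vec
from underthesea import word_tokenize

class Sentence2Vec:
    def __init__(self, model_path):
        self.model = Word2Vec.load(model_path)

    def get_vector(self, sentence):
        # tokenize the same way the training data was tokenized (assumed)
        tokens = word_tokenize(sentence, format='text').split()
        # keep only tokens the model saw during training
        vectors = [self.model.wv[t] for t in tokens if t in self.model.wv]
        if not vectors:
            return np.zeros(self.model.vector_size)  # assumed fallback
        return np.mean(vectors, axis=0)
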
Example #2
def get_SR(inputDict: dict, FISDir: str, FISName: str, minconf: float,
           minsupp: float):
    frequentItemSet = read_file.read_lines_to_list(FISDir, FISName, ', ')
    ruleList, topTenRules = Strong_rules.get_all_strong_rule_2(
        inputDict, frequentItemSet, minconf)
    if ruleList:
        write_file.list_to_txt(
            ruleList,
            './RESULT/Strong_rule_minsupp_{0} minconf_{1}/'.format(
                minsupp, minconf),
            'Rule_' + FISName)
    else:
        print('No rules satisfy the thresholds.')
    if topTenRules:
        write_file.list_to_txt(
            topTenRules,
            './RESULT/Topten_Strong_rule_minsupp_{0} minconf_{1}/'.format(
                minsupp, minconf),
            '10_Rules_' + FISName)
    else:
        print('No rules satisfy the thresholds.')
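
Strong_rules.get_all_strong_rule_2 is project code. Below is a minimal sketch of the standard strong-rule check it presumably performs, testing confidence(X -> Y) = supp(X ∪ Y) / supp(X) against minconf; the `support` mapping and all names here are illustrative:

from itertools import combinations

def strong_rules_from_itemset(itemset, support, minconf):
    # `support` is assumed to map frozensets to their support values
    rules = []
    items = frozenset(itemset)
    for r in range(1, len(items)):
        for antecedent in map(frozenset, combinations(items, r)):
            conf = support[items] / support[antecedent]
            if conf >= minconf:
                rules.append((set(antecedent), set(items - antecedent), conf))
    return rules
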
Example #3
    def write_outfile(self):
        # dump the intermediate TF-IDF artifacts into the TFIDF folder
        write_file.list_to_txt(self.listSentToWord, 'TFIDF',
                               'Word_Tokenize.txt')
        write_file.dict_to_txt(self.dictWordAtSent, 'TFIDF',
                               'Index_of_word.txt')
        write_file.list_to_txt(self.listTF, 'TFIDF', 'TF.txt')
        write_file.dict_to_txt(self.dictIDF, 'TFIDF', 'IDF.txt')
        write_file.list_to_txt(self.listTFIDF, 'TFIDF', 'TFIDFs.txt')
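
listTF, dictIDF, and listTFIDF come from the TFIDF class, which is not shown here. Below is a sketch of the classic definitions these files presumably hold, tf(w, s) = count(w, s) / |s| and idf(w) = log(N / df(w)); the actual class may use a different variant:

import math
from collections import Counter

def tf_idf(tokenized_sents):
    # document frequency: number of sentences containing each word
    n = len(tokenized_sents)
    df = Counter(w for sent in tokenized_sents for w in set(sent))
    idf = {w: math.log(n / df[w]) for w in df}
    tfidf = []
    for sent in tokenized_sents:
        counts = Counter(sent)
        tfidf.append({w: counts[w] / len(sent) * idf[w] for w in counts})
    return tfidf, idf
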
Example #4
def main():
    # fileName = 'giao_thong.txt'
    fileName = 'corpus_che.txt'
    listRawSentence = read_file.read_line_to_sentenceList(
        './datasets/', fileName, '\n')
    # t = TFIDF(listRawSentence, fileName)
    # t.write_outfile()

    listSentToWord = [
        word_tokenize(sent, format='text') for sent in listRawSentence
    ]
    # listSentToWord = [word_tokenize(sent) for sent in listRawSentence]
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer(lowercase=False, min_df=100)
    tfidf_matrix = tfidf.fit_transform(listSentToWord)
    # sklearn < 1.0 API; newer releases renamed it to get_feature_names_out()
    features = tfidf.get_feature_names()
    listStopWords = []
    print(min(tfidf.idf_), max(tfidf.idf_), len(features))
    for (index, feature) in enumerate(features):
        # an IDF at least 1 below the maximum means the word occurs in
        # many sentences, so treat it as a stopword candidate
        if tfidf.idf_[index] <= (max(tfidf.idf_) - 1):
            listStopWords.append(feature)
    write_file.list_to_txt(listStopWords, './TFIDF/',
                           'stopword_withSKlearn.txt')
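
The threshold above works because TfidfVectorizer, with its default smooth_idf=True, computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, so frequent words get small values. A quick check of that formula:

import math

def sklearn_idf(n_docs, doc_freq):
    # TfidfVectorizer default: idf(t) = ln((1 + n) / (1 + df(t))) + 1
    return math.log((1 + n_docs) / (1 + doc_freq)) + 1

print(sklearn_idf(10000, 9000))  # ~1.11: appears almost everywhere, stopword-like
print(sklearn_idf(10000, 100))   # ~5.60: rare, informative word
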
Example #5
def main():
    ### convert train_vn from labeled to unlabeled
    # a = read_lines_to_list('./datasets/', 'train_vn.txt', ' ')
    # for ia in a:
    #     del ia[-1]
    #     for num in range(len(ia)):
    #         ia[num] = float(ia[num])
    # write_file.list_to_txt(a, './datasets/', 'train_vn_nonLabel.vector')
    # b = read_file.read_lines_to_floatlist('./datasets/', 'train_vn_nonLabel.vector', ', ')
    # c = read_file.read_lines_to_floatlist_nonSquareBracklets('./datasets/', 'train_vn_nonLabel.vector', ', ')
    ### merge files
    # linkFolder = './datasets/giao-thong/'
    # listDocName = os.listdir(linkFolder)
    # listAllDoc = []
    # for fileName in listDocName:
    #     listAllDoc.append(read_file.read_line_to_sentenceList(linkFolder, fileName, '\n')[1:])
    # listDocToSent = []
    # for doc in listAllDoc:
    #     for clusText in doc:
    #         listDocToSent += underthesea.sent_tokenize(clusText)
    # write_file.list_to_txt(listDocToSent, './datasets/', 'giao_thong.txt')
    ### split giao_thong by cluster
    fileName = 'giao_thong.txt'
    nameclus0 = 'giaothongclus0.clus'
    nameclus1 = 'giaothongclus1.clus'
    nameclus2 = 'giaothongclus2.clus'
    clus0 = read_file.read_lines_to_intlist_nonSquareBracklets(
        './SomeExample', nameclus0, ' ')[0]
    clus1 = read_file.read_lines_to_intlist_nonSquareBracklets(
        './SomeExample', nameclus1, ' ')[0]
    clus2 = read_file.read_lines_to_intlist_nonSquareBracklets(
        './SomeExample', nameclus2, ' ')[0]
    listRawText = read_file.read_line_to_sentenceList('./SomeExample/',
                                                      fileName, '\n')
    listRawClus0 = []
    listRawClus1 = []
    listRawClus2 = []
    for iSent, sent in enumerate(listRawText):
        if iSent in clus0:
            listRawClus0.append(sent)
        elif iSent in clus1:
            listRawClus1.append(sent)
        elif iSent in clus2:
            listRawClus2.append(sent)
    write_file.list_to_txt(listRawClus0, 'SomeExample', 'giao_thong_clus0.txt')
    write_file.list_to_txt(listRawClus1, 'SomeExample', 'giao_thong_clus1.txt')
    write_file.list_to_txt(listRawClus2, 'SomeExample', 'giao_thong_clus2.txt')
# linkFolder = './SomeExample/1/'
linkFolder = './SomeExample/2/'
fileMergeName = 'giao_thong'
listDocName = os.listdir(linkFolder)
listAllDoc = []
listLabelStr = ''
negSent = []
posSent = []
neuSent = []
for fileName in listDocName:
    fileType = fileName[-6:]  # suffix such as '.clus0', '.clus1', '.clus2'
    print(fileType)
    if fileType == '.clus0':
        negSent = read_file.read_line_to_sentenceList(linkFolder, fileName,
                                                      '\n')
        for _ in range(len(negSent)):
            listLabelStr += '0, '
    elif fileType == '.clus1':
        posSent = read_file.read_line_to_sentenceList(linkFolder, fileName,
                                                      '\n')
        for _ in range(len(posSent)):
            listLabelStr += '1, '
    elif fileType == '.clus2':
        neuSent = read_file.read_line_to_sentenceList(linkFolder, fileName,
                                                      '\n')
        for _ in range(len(neuSent)):
            listLabelStr += '2, '
listAllDoc += negSent + posSent + neuSent
linkDatasets = './datasets/'  # assumed path; linkDatasets was undefined in the snippet
write_file.list_to_txt(listAllDoc, linkDatasets, fileMergeName + '.txt')
with open(linkDatasets + fileMergeName + '.lb', 'w', encoding='utf-8') as f:
    f.write(listLabelStr)
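
A quick sanity check that the merged corpus and the label file stay aligned; it reuses the (assumed) linkDatasets path above and filters the trailing ', ' out of the label string:

sents = read_file.read_line_to_sentenceList(linkDatasets, fileMergeName + '.txt', '\n')
with open(linkDatasets + fileMergeName + '.lb', encoding='utf-8') as f:
    labels = [lb for lb in f.read().split(', ') if lb]
assert len(sents) == len(labels), 'sentence/label count mismatch'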