def train(self, train_set, train_labels):
    """
    Train classifier
    :param train_set: numpy array with all documents in training set
    :param train_labels: numpy array with labels in training set
    :return: trained classifier
    """
    # Transform the dataset, obtaining the count of every word in the
    # vocabulary and performing the tf-idf conversion
    train_counts = self.count_vect.fit_transform(train_set)
    train_tfidf = self.tfidf_transformer.fit_transform(train_counts)

    # Build inverse vocabulary with all words in the dictionary (needed to
    # recover the word from its index in some feature functions)
    self.inv_vocab = {v: k for k, v in self.count_vect.vocabulary_.items()}

    # If using the feature representation, obtain the corresponding feature
    # matrix, normalize it and append it
    if self.with_feat:
        matrix = self.build_feature_matrix(train_counts)
        matrix_norm = Normalizer().fit(matrix).transform(matrix)
        train_tfidf = csr_matrix(
            np.concatenate((train_tfidf.toarray(), matrix_norm.toarray()),
                           axis=1))

    # If using the bigram representation, obtain the top 100 bigrams and
    # append them
    if self.with_bigram:
        bigram_counts = self.bigram_vect.fit_transform(train_set)
        bigram_tfidf = self.tfidf_bigram.fit_transform(bigram_counts)
        train_tfidf = csr_matrix(
            np.concatenate((train_tfidf.toarray(), bigram_tfidf.toarray()),
                           axis=1))

    # Return trained classifier
    return self.classifier.fit(train_tfidf, train_labels)
def test(self, clf, test_set):
    """
    Test classifier
    :param clf: scikit-learn object containing a trained classifier with the desired algorithm
    :param test_set: numpy array with all documents in test set
    :return: numpy array with the predictions
    """
    # Transform the test set the same way the training set was transformed
    test_counts = self.count_vect.transform(test_set)
    test_tfidf = self.tfidf_transformer.transform(test_counts)

    # If using the feature representation, obtain the corresponding feature
    # matrix and append it
    if self.with_feat:
        test_matrix = self.build_feature_matrix(test_counts)
        test_matrix_norm = Normalizer().fit(test_matrix).transform(test_matrix)
        test_tfidf = csr_matrix(
            np.concatenate((test_tfidf.toarray(), test_matrix_norm.toarray()),
                           axis=1))

    # If using the bigram representation, obtain the corresponding top 100
    # bigrams and append them. Note: only transform() is used here; calling
    # fit_transform() on the test set would refit the vectorizer on test data
    # and produce a vocabulary inconsistent with training.
    if self.with_bigram:
        bigram_counts = self.bigram_vect.transform(test_set)
        bigram_tfidf = self.tfidf_bigram.transform(bigram_counts)
        test_tfidf = csr_matrix(
            np.concatenate((test_tfidf.toarray(), bigram_tfidf.toarray()),
                           axis=1))

    # Return predictions
    return clf.predict(test_tfidf)
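# Minimal usage sketch for train()/test() above. The class name
# "DocumentClassifier" and the component wiring are assumptions for
# illustration; the object only needs the attributes the two methods
# reference (count_vect, tfidf_transformer, classifier, and the two flags).
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

pipeline = DocumentClassifier()              # hypothetical host class
pipeline.count_vect = CountVectorizer()
pipeline.tfidf_transformer = TfidfTransformer()
pipeline.classifier = MultinomialNB()
pipeline.with_feat = False                   # skip the extra feature matrix
pipeline.with_bigram = False                 # skip the bigram representation

clf = pipeline.train(train_docs, train_labels)   # arrays assumed given
predictions = pipeline.test(clf, test_docs)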
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)
y = data_set.target

# Feature selection
ch2 = SelectKBest(chi2, k=1800)
X = ch2.fit_transform(X, y)

X = X.toarray()
n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print

###############################################################################
# Test a classifier using K-fold Cross Validation

# Setup 10-fold cross validation
num_fold = 10
kf = KFold(n_samples, k=num_fold, indices=True)

# Note: NBs are not working
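# Sketch of the cross-validation loop the setup above feeds into. The
# classifier choice (LinearSVC) and the accuracy averaging are assumptions
# for illustration; the fold iteration matches the old
# KFold(n, k=..., indices=True) interface used above.
from sklearn.svm import LinearSVC

accuracies = []
for train_idx, test_idx in kf:
    clf = LinearSVC()
    clf.fit(X[train_idx], y[train_idx])
    accuracies.append(clf.score(X[test_idx], y[test_idx]))
print "Mean CV accuracy: %0.3f" % (sum(accuracies) / num_fold)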
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)
y = data_set.target

# # Feature selection (disabled)
# select_chi2 = 1900
# print "Extracting %d best features by a chi-squared test" % select_chi2
# t0 = time()
# ch2 = SelectKBest(chi2, k=select_chi2)
# X = ch2.fit_transform(X, y)
# print "Done in %fs" % (time() - t0)
# print "L1: n_samples: %d, n_features: %d" % X.shape
# print

X_den = X.toarray()
n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print

###############################################################################
# Setup part
#
# Notation:
# N: number of training examples; K: number of models in level 0
# X: feature matrix; y: result array; z_k: prediction result array for model k
#
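# Sketch of how the notation above plays out when building level-0
# predictions for stacking: each of the K models produces an out-of-fold
# prediction column z_k, and the stacked columns form the level-1 training
# matrix. The model choices and the `kf` fold iterable (as set up in the
# cross-validation script above) are assumptions for illustration.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

level0_models = [LinearSVC(), LogisticRegression()]    # K = 2
Z = np.zeros((n_samples, len(level0_models)))          # N x K level-1 features
for k, model in enumerate(level0_models):
    for train_idx, test_idx in kf:                     # out-of-fold splits
        model.fit(X_den[train_idx], y[train_idx])
        Z[test_idx, k] = model.predict(X_den[test_idx])  # column z_k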
# Adjusted Rand index: measures the similarity of the two assignments,
# ignoring permutations
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))

# Silhouette coefficient: a higher score relates to a model with better
# defined clusters (computed on the model's own cluster labels)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

###############################################################################
# Visualize the results on PCA-reduced data

if opts.print_visualization:
    np.random.seed(42)
    sample_size = 300

    data = X.toarray()
    n_digits = source_num
    n_samples, n_features = data.shape

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max] x [y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
    y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) #adjusted rand-index: function that measures the similarity of the two assignments, ignoring permutations print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_)) #silhouette coefficient: a higher score relates to a model with better defined clusters print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, sample_size=1000)) ############################################################################### # Visualize the results on PCA-reduced data if(opts.print_visualization): np.random.seed(42) sample_size = 300 data = X.toarray() n_digits = source_num n_samples, n_features = data.shape reduced_data = PCA(n_components=2).fit_transform(data) kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10) kmeans.fit(reduced_data) # Step size of the mesh. Decrease to increase the quality of the VQ. h = .02 # point in the mesh [x_min, m_max]x[y_min, y_max]. # Plot the decision boundary. For that, we will assign a color to each x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max() y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max() xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) # Obtain labels for each point in mesh. Use last trained model.
print len(data_train.data)
print len(data_test.data)
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)

# Fit the vectorizer on the training data only, then reuse it on the test
# data, so both sets share the same vocabulary
X = vectorizer.fit_transform(data_train.data)
X = Normalizer(norm="l2", copy=False).transform(X)
X_test = vectorizer.transform(data_test.data)
X_test = Normalizer(norm="l2", copy=False).transform(X_test)

X = X.toarray()
X_test = X_test.toarray()
n_samples, n_features = X.shape
test_samples, test_features = X_test.shape
print "done in %fs" % (time() - t0)
print "Train set - n_samples: %d, n_features: %d" % (n_samples, n_features)
print "Test set - n_samples: %d, n_features: %d" % (test_samples, test_features)
print

# Fit the model
# When nu=0.01, gamma=0.0034607 is the smallest value that generates a
# non-empty (>0) result
clf = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.05)
clf.fit(X)

# Predict on X_test
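# Sketch of the prediction step the comment above leads into: OneClassSVM
# returns +1 for inliers and -1 for outliers, so counting each class is the
# natural report. The counting lines are illustrative assumptions.
pred = clf.predict(X_test)
n_inliers = (pred == 1).sum()
n_outliers = (pred == -1).sum()
print "Inliers: %d, outliers: %d" % (n_inliers, n_outliers)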
def run(self):
    self.texts = []
    self.signals.UpdateProgressBar.emit(0)
    xs = []
    ys = []
    similarity = []
    output_dir = self.configurations.get(
        "output_files_directory", "output_files") + "/preprocessing/"
    need_full_preprocessing = self.configurations.get(
        "need_full_preprocessing", True)

    if self.first_start:
        if need_full_preprocessing:
            for filename in self.filenames:
                text = TextData(filename)
                text.readSentencesFromInputText()
                self.texts.append(text)

            self.signals.PrintInfo.emit('Tokenization...')
            self.texts = tokenizeTextData(self.texts, self.configurations)
            self.signals.UpdateProgressBar.emit(10)

            self.signals.PrintInfo.emit('Removing stop words...')
            self.texts, log_string = removeStopWordsInTexts(
                self.texts, self.morph, self.configurations)
            writeStringToFile(log_string.replace('\n ', '\n'),
                              output_dir + 'output_stage_1.txt')
            self.signals.UpdateProgressBar.emit(15)

            self.signals.PrintInfo.emit('Normalizing to base form...')
            self.texts, log_string = normalizeTexts(self.texts, self.morph)
            writeStringToFile(log_string.replace('\n ', '\n'),
                              output_dir + 'output_stage_2.txt')
            self.signals.UpdateProgressBar.emit(25)

            self.signals.PrintInfo.emit('Fixing letter case...')
            self.texts, log_string = fixRegisterInTexts(self.texts, self.morph)
            writeStringToFile(log_string.replace('\n ', '\n'),
                              output_dir + 'output_stage_3.txt')
            self.signals.UpdateProgressBar.emit(30)

            if self.configurations.get("need_apriori", False):
                self.signals.PrintInfo.emit('Computing Apriori...')
                makeAprioriForTexts(self.texts, output_dir)

            self.signals.PrintInfo.emit('...')
            for text in self.texts:
                self.input_texts.append(
                    getCompiledFromSentencesText(text.register_pass_centences))
        else:
            for filename in self.filenames:
                self.input_texts.append(readFullTextInputText(filename))
    else:
        self.signals.PrintInfo.emit(
            'Reusing the results of the previous preprocessing run')

    self.signals.UpdateProgressBar.emit(40)
    self.first_start = False

    if len(self.input_texts) < 3:
        self.signals.PrintInfo.emit(
            'Not enough documents for a correct analysis!')
    else:
        # Load the Russian stop words
        russian_stop_words = []
        with open(stop_words_filename) as f:
            russian_stop_words = f.readlines()
        russian_stop_words = [x.strip() for x in russian_stop_words]
        self.signals.UpdateProgressBar.emit(45)

        vectorizer = CountVectorizer(min_df=1, stop_words=russian_stop_words)
        dtm = vectorizer.fit_transform(self.input_texts)

        pre_svd_matrix = pd.DataFrame(
            dtm.toarray(),
            index=self.short_filenames,
            columns=vectorizer.get_feature_names()).head(10)
        pre_svd_matrix_filename = self.output_dir + 'pre_svd_matrix.csv'
        pre_svd_matrix.to_csv(pre_svd_matrix_filename, sep=";", decimal=',')
        self.signals.PrintInfo.emit(
            'File with the [words x documents] matrix for LSA: ' +
            pre_svd_matrix_filename)

        features_count = len(vectorizer.get_feature_names())
        self.signals.PrintInfo.emit('Unique words: ' + str(features_count))
        self.signals.UpdateProgressBar.emit(50)

        max_component = min(len(self.input_texts), features_count)

        # Perform LSA and compress the space down to 2 dimensions
        if max_component <= self.lsa_components_count:
            self.signals.PrintInfo.emit(
                'Warning! '
                'Number of components reduced from ' +
                str(self.lsa_components_count) + ' to ' +
                str(max_component - 1))
            self.lsa_components_count = max_component - 1

        # dtm_lsa = svds(dtm_lsa, k=self.lsa_components_count)
        lsa = TruncatedSVD(self.lsa_components_count, algorithm='arpack')
        dtm = csc_matrix(dtm, dtype=float)
        dtm_lsa = lsa.fit_transform(dtm)
        dtm_lsa = csc_matrix(dtm_lsa, dtype=float)
        dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
        self.signals.UpdateProgressBar.emit(70)
        dtm_lsa = np.array(dtm_lsa.toarray())

        xs = [w[0] for w in dtm_lsa]
        ys = [w[1] for w in dtm_lsa]

        columns = ['Filename']
        if len(dtm_lsa) > 0:
            for column_index in range(len(dtm_lsa[0])):
                columns.append('Component_' + str(column_index + 1))

        docs_weight_df = pd.DataFrame(columns=columns, index=None)
        docs_weight_df[columns[0]] = self.short_filenames
        for column_index in range(1, len(columns)):
            docs_weight_df[columns[column_index]] = [
                w[column_index - 1] for w in dtm_lsa
            ]

        documents_weight_filename = self.output_dir + 'documents_weight.csv'
        docs_weight_df.to_csv(documents_weight_filename, sep=";", decimal=',')
        self.signals.PrintInfo.emit('File with document weights: ' +
                                    documents_weight_filename)
        self.signals.UpdateProgressBar.emit(90)

        # Compute the document-to-document similarity table; since the rows
        # of dtm_lsa are L2-normalized, this product is cosine similarity
        similarity = np.dot(dtm_lsa, dtm_lsa.T)
        relationsTable = pd.DataFrame(similarity,
                                      index=self.short_filenames,
                                      columns=self.short_filenames)
        relation_table_filename = self.output_dir + 'document_relation_table.csv'
        relationsTable.to_csv(relation_table_filename, sep=";", decimal=',')
        self.signals.PrintInfo.emit(
            'File with the document relation table: ' +
            relation_table_filename)

    self.signals.PrintInfo.emit('Calculations finished!')
    self.signals.UpdateProgressBar.emit(100)
    self.signals.Finished.emit(xs, ys, similarity, self.short_filenames)
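# Side note on the similarity table built in run() above: because the LSA
# rows were L2-normalized, the matrix product dtm_lsa * dtm_lsa.T is exactly
# the pairwise cosine similarity. A minimal self-contained check of that
# equivalence, assuming scikit-learn is available:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer

rows = Normalizer(copy=False).fit_transform(np.random.rand(4, 2))
by_product = rows.dot(rows.T)            # the formulation used in run()
by_cosine = cosine_similarity(rows)      # equivalent library call
assert np.allclose(by_product, by_cosine)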