Example #1
    def train(self, train_set, train_labels):
        """
        Train classifier
        :param train_set: numpy array with all documents in training set
        :param train_labels: numpy array with labels in training set
        :return: trained classifier
        """
        # Transform dataset, obtaining the count of every word in vocabulary and performing tfidf conversion
        train_counts = self.count_vect.fit_transform(train_set)
        train_tfidf = self.tfidf_transformer.fit_transform(train_counts)

        # Build inverse vocabulary with all words in dictionary (needed to recover the word from the index
        # in some feature functions)
        self.inv_vocab = {v: k for k, v in self.count_vect.vocabulary_.items()}

        # If using feature representation, obtain the corresponding feature matrix and append
        if self.with_feat:
            matrix = self.build_feature_matrix(train_counts)
            matrix_norm = Normalizer().fit(matrix).transform(matrix)
            train_tfidf = csr_matrix(
                np.concatenate((train_tfidf.toarray(), matrix_norm.toarray()),
                               axis=1))

        # If using bigram representation, obtain the top 100 bigrams and append
        if self.with_bigram:
            bigram_counts = self.bigram_vect.fit_transform(train_set)
            bigram_tfidf = self.tfidf_bigram.fit_transform(bigram_counts)
            train_tfidf = csr_matrix(
                np.concatenate((train_tfidf.toarray(), bigram_tfidf.toarray()),
                               axis=1))

        # Return trained classifier
        return self.classifier.fit(train_tfidf, train_labels)
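This method relies on attributes set up elsewhere (self.count_vect, self.tfidf_transformer, self.bigram_vect, self.tfidf_bigram, self.classifier, self.with_feat, self.with_bigram). A minimal sketch of a constructor that would satisfy them, with hypothetical defaults rather than the author's original wiring:

    def __init__(self, classifier=None, with_feat=False, with_bigram=False):
        # Assumed setup, not the original code
        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        from sklearn.naive_bayes import MultinomialNB
        self.count_vect = CountVectorizer()
        self.tfidf_transformer = TfidfTransformer()
        # Top-100 bigram vocabulary, matching the "top 100 bigrams" comment above
        self.bigram_vect = CountVectorizer(ngram_range=(2, 2), max_features=100)
        self.tfidf_bigram = TfidfTransformer()
        self.classifier = classifier if classifier is not None else MultinomialNB()
        self.with_feat = with_feat
        self.with_bigram = with_bigram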
Example #2
    def test(self, clf, test_set):
        """
        Test classifier
        :param clf: scikit-learn object containing a trained classifier with the desired algorithm
        :param test_set: numpy array with all documents in test set
        :return: numpy array with the predictions
        """
        # Transform test set the same way the training set is transformed
        test_counts = self.count_vect.transform(test_set)
        test_tfidf = self.tfidf_transformer.transform(test_counts)

        # If using feature representation, obtain the corresponding feature matrix and append
        if self.with_feat:
            test_matrix = self.build_feature_matrix(test_counts)
            test_matrix_norm = Normalizer().fit(test_matrix).transform(
                test_matrix)
            test_tfidf = csr_matrix(
                np.concatenate(
                    (test_tfidf.toarray(), test_matrix_norm.toarray()),
                    axis=1))

        # If using bigram representation, obtain the corresponding top 100 bigrams and append
        if self.with_bigram:
            bigram_counts = self.bigram_vect.transform(test_set)
            bigram_tfidf = self.tfidf_bigram.transform(bigram_counts)
            test_tfidf = csr_matrix(
                np.concatenate((test_tfidf.toarray(), bigram_tfidf.toarray()),
                               axis=1))

        # Return predictions
        return clf.predict(test_tfidf)
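A hedged usage sketch of the two methods above (the wrapper class name TextClassifier and the document arrays are assumptions, not taken from the snippet):

# Hypothetical end-to-end use
wrapper = TextClassifier(with_feat=True, with_bigram=True)
trained_clf = wrapper.train(train_docs, train_labels)   # numpy arrays of raw documents / labels
predictions = wrapper.test(trained_clf, test_docs)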
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)

y = data_set.target

# feature selection
ch2 = SelectKBest(chi2, k=1800)
X = ch2.fit_transform(X, y)

X = X.toarray()

n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print


###############################################################################
# Test a classifier using K-fold Cross Validation

# Setup 10 fold cross validation
num_fold = 10
kf = KFold(n_samples, k=num_fold, indices=True)
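# (Hedged note, not part of the original script: the call above uses an old
#  scikit-learn API; in current versions KFold is constructed as
#  KFold(n_splits=num_fold) and iterated with
#  `for train_index, test_index in kf.split(X): ...`)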

# Note: Naive Bayes classifiers (NBs) are not working
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)

y = data_set.target

# # Feature selection
# select_chi2 = 1900
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X = ch2.fit_transform(X, y)
# print "Done in %fs" % (time() - t0)
# print "L1:      n_samples: %d, n_features: %d" % X.shape
# print

X_den = X.toarray()

n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print


###############################################################################
# Setup part
# 
# Notation:
# N: number of training examples; K: number of models in level 0
# X: feature matrix; y: target array; z_k: prediction array for the k-th model
# 
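The notation above describes a stacking setup, but the corresponding code is not shown in this snippet. A minimal, hedged sketch (current scikit-learn API; the function and model choices are assumptions) of how level-0 out-of-fold predictions z_k could be collected into a level-1 feature matrix:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

def level0_predictions(models, X, y, n_folds=10):
    # Z has one column per level-0 model; each column holds that model's
    # out-of-fold predictions over all N training examples.
    kf = KFold(n_splits=n_folds)
    Z = np.zeros((X.shape[0], len(models)))
    for k, model in enumerate(models):
        for train_index, test_index in kf.split(X):
            model.fit(X[train_index], y[train_index])
            Z[test_index, k] = model.predict(X[test_index])
    return Z

# e.g. Z = level0_predictions([MultinomialNB(), LogisticRegression()], X_den, y)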
Example #5
# Adjusted Rand index: measures the similarity of the two assignments, ignoring permutations
print("Adjusted Rand-Index: %.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))

# Silhouette coefficient: a higher score indicates a model with better-defined clusters
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(X, labels, sample_size=1000))
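As a quick, standalone illustration of the permutation-invariance mentioned above (hedged; not part of the original script):

from sklearn import metrics
# The same partition under a label permutation still scores 1.0 ...
print(metrics.adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))   # 1.0
# ... while an unrelated partition scores at or below zero
print(metrics.adjusted_rand_score([0, 0, 1, 1], [0, 1, 0, 1]))   # -0.5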

###############################################################################
# Visualize the results on PCA-reduced data

if opts.print_visualization:
    np.random.seed(42)
    sample_size = 300

    data = X.toarray()
    n_digits = source_num
    n_samples, n_features = data.shape

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max] x [y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each point in the mesh.
    x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
    y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
Example #6
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))

# Adjusted Rand index: measures the similarity of the two assignments, ignoring permutations
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))

# Silhouette coefficient: a higher score indicates a model with better-defined clusters
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, sample_size=1000))

###############################################################################
# Visualize the results on PCA-reduced data

if opts.print_visualization:
    np.random.seed(42)
    sample_size = 300

    data = X.toarray()
    n_digits = source_num
    n_samples, n_features = data.shape

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max] x [y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each point in the mesh.
    x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
    y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Obtain labels for each point in mesh. Use last trained model.
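The example is cut off here; a hedged continuation in the spirit of the standard scikit-learn K-Means visualization (it assumes `import matplotlib.pyplot as plt` at the top of the script) would be:

    # Predict a cluster for every mesh point, reshape back to the grid,
    # and draw it as a colored background with the documents plotted on top.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    plt.title('K-Means clustering on PCA-reduced data')
    plt.show()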
Example #7
print len(data_train.data)
print len(data_test.data)
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()

vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_train.data)
X = Normalizer(norm="l2", copy=False).transform(X)

X_test = vectorizer.transform(data_test.data)
X_test = Normalizer(norm="l2", copy=False).transform(X_test)

X = X.toarray()
X_test = X_test.toarray()

n_samples, n_features = X.shape
test_samples, test_features = X_test.shape
print "done in %fs" % (time() - t0)
print "Train set - n_samples: %d, n_features: %d" % (n_samples, n_features)
print "Test set  - n_samples: %d, n_features: %d" % (test_samples, test_features)
print


# fit the model
# when nu=0.01, gamma=0.0034607 is the smallest to generate >0 result
clf = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.05) 
clf.fit(X)
# predict on X_test
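The snippet ends before the prediction it announces; a hedged continuation (OneClassSVM.predict returns +1 for inliers and -1 for outliers) could be:

# Count how many test documents fall inside the learned region
y_pred = clf.predict(X_test)
n_inliers = int((y_pred == 1).sum())
print("Inliers predicted on the test set: %d / %d" % (n_inliers, test_samples))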
Example #8
    def run(self):
        self.texts = []
        self.signals.UpdateProgressBar.emit(0)
        xs = []
        ys = []
        similarity = []

        output_dir = self.configurations.get(
            "output_files_directory", "output_files") + "/preprocessing/"

        need_full_preprocessing = self.configurations.get(
            "need_full_preprocessing", True)
        if self.first_start:
            if need_full_preprocessing:
                for filename in self.filenames:
                    text = TextData(filename)
                    text.readSentencesFromInputText()
                    self.texts.append(text)

                self.signals.PrintInfo.emit('Токенизация...')
                self.texts = tokenizeTextData(self.texts, self.configurations)

                self.signals.UpdateProgressBar.emit(10)

                self.signals.PrintInfo.emit('Удаление стоп-слов...')
                self.texts, log_string = removeStopWordsInTexts(
                    self.texts, self.morph, self.configurations)
                writeStringToFile(log_string.replace('\n ', '\n'),
                                  output_dir + 'output_stage_1.txt')
                self.signals.UpdateProgressBar.emit(15)

                self.signals.PrintInfo.emit('Приведение к нормальной форме...')
                self.texts, log_string = normalizeTexts(self.texts, self.morph)
                writeStringToFile(log_string.replace('\n ', '\n'),
                                  output_dir + 'output_stage_2.txt')
                self.signals.UpdateProgressBar.emit(25)

                self.signals.PrintInfo.emit('Приведение регистра...')
                self.texts, log_string = fixRegisterInTexts(
                    self.texts, self.morph)
                writeStringToFile(log_string.replace('\n ', '\n'),
                                  output_dir + 'output_stage_3.txt')
                self.signals.UpdateProgressBar.emit(30)

                if self.configurations.get("need_apriori", False):
                    self.signals.PrintInfo.emit('Рассчет Apriori...')
                    makeAprioriForTexts(self.texts, output_dir)

                self.signals.PrintInfo.emit('...')
                for text in self.texts:
                    self.input_texts.append(
                        getCompiledFromSentencesText(
                            text.register_pass_centences))
            else:
                for filename in self.filenames:
                    self.input_texts.append(readFullTextInputText(filename))
        else:
            self.signals.PrintInfo.emit(
                'Использование предыдущих результатов предварительной обработки'
            )

        self.signals.UpdateProgressBar.emit(40)

        self.first_start = False

        if len(self.input_texts) < 3:
            self.signals.PrintInfo.emit(
                'Недостаточно документов для корректного анализа!')
        else:
            # Add Russian stop words
            russian_stop_words = []
            with open(stop_words_filename) as f:
                russian_stop_words = f.readlines()
            russian_stop_words = [x.strip() for x in russian_stop_words]

            self.signals.UpdateProgressBar.emit(45)

            vectorizer = CountVectorizer(min_df=1,
                                         stop_words=russian_stop_words)
            dtm = vectorizer.fit_transform(self.input_texts)

            pre_svd_matrix = pd.DataFrame(
                dtm.toarray(),
                index=self.short_filenames,
                columns=vectorizer.get_feature_names()).head(10)
            pre_svd_matrix_filename = self.output_dir + 'pre_svd_matrix.csv'
            pre_svd_matrix.to_csv(pre_svd_matrix_filename,
                                  sep=";",
                                  decimal=',')
            self.signals.PrintInfo.emit(
                'Файл с матрицей [слова * документы] для ЛСА:' +
                pre_svd_matrix_filename)
            features_count = len(vectorizer.get_feature_names())
            self.signals.PrintInfo.emit('Уникальных слов:' +
                                        str(features_count))

            self.signals.UpdateProgressBar.emit(50)

            max_component = min(len(self.input_texts), features_count)

            # Perform LSA and compress the space to 2 dimensions
            if max_component <= self.lsa_components_count:
                self.signals.PrintInfo.emit(
                    'Внимание! Число компонент уменьшено с ' +
                    str(self.lsa_components_count) + ' до ' +
                    str(max_component - 1))
                self.lsa_components_count = max_component - 1

            # dtm_lsa = svds(dtm_lsa, k=self.lsa_components_count)

            lsa = TruncatedSVD(self.lsa_components_count, algorithm='arpack')

            dtm = csc_matrix(dtm, dtype=float)
            dtm_lsa = lsa.fit_transform(dtm)

            dtm_lsa = csc_matrix(dtm_lsa, dtype=float)
            dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
            self.signals.UpdateProgressBar.emit(70)

            dtm_lsa = np.array(dtm_lsa.toarray())

            xs = [w[0] for w in dtm_lsa]
            ys = [w[1] for w in dtm_lsa]

            columns = ['Filename']
            if len(dtm_lsa) > 0:
                for column_index in range(len(dtm_lsa[0])):
                    columns.append('Component_' + str(column_index + 1))

            docs_weight_df = pd.DataFrame(columns=columns, index=None)
            docs_weight_df[columns[0]] = self.short_filenames
            for column_index in range(1, len(columns)):
                docs_weight_df[columns[column_index]] = [
                    w[column_index - 1] for w in dtm_lsa
                ]
            documents_weight_filename = self.output_dir + 'documents_weight.csv'
            docs_weight_df.to_csv(documents_weight_filename,
                                  sep=";",
                                  decimal=',')
            self.signals.PrintInfo.emit('Файл с весами документов:' +
                                        documents_weight_filename)

            self.signals.UpdateProgressBar.emit(90)

            # Compute the document relation table
            similarity = np.dot(dtm_lsa, dtm_lsa.T)
            relationsTable = pd.DataFrame(similarity,
                                          index=self.short_filenames,
                                          columns=self.short_filenames).head(
                                              len(self.short_filenames))

            relation_table_filename = self.output_dir + 'document_relation_table.csv'
            relationsTable.to_csv(relation_table_filename,
                                  sep=";",
                                  decimal=',')
            self.signals.PrintInfo.emit(
                'Файл с таблицей отношений документов:' +
                relation_table_filename)

        self.signals.PrintInfo.emit('Рассчеты закончены!')
        self.signals.UpdateProgressBar.emit(100)
        self.signals.Finished.emit(xs, ys, similarity, self.short_filenames)
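Because dtm_lsa is L2-normalized row-wise before this step, the dot-product matrix written to document_relation_table.csv is exactly the pairwise cosine similarity of the documents. A hedged, standalone equivalent using scikit-learn:

from sklearn.metrics.pairwise import cosine_similarity
# Equivalent to np.dot(dtm_lsa, dtm_lsa.T) when the rows of dtm_lsa have unit L2 norm
similarity = cosine_similarity(dtm_lsa)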