Example #1
    def handle(self):
        """
        Cluster the corpus documents.

        """

        index_db = IndexDB()
        self.connection = index_db.handler()
        documents = self.indexed_documents()
        total_docs = len(documents)
        # One cluster for roughly every 500 documents (at least one).
        num_clusters = max(1, round(total_docs / 500))

        # Load the vectorizer from its dump, or vectorize the documents.
        try:
            vectorizer = joblib.load('vectorizer.pkl')
        except FileNotFoundError:
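            # No cached vectorizer: build the TF-IDF matrix and vectorizer
            # from the indexed documents.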
            matrix, vectorizer = self.documents_vectors()

        terms = vectorizer.get_feature_names()
        print("\nUsing %d features for clustering.\n" % (len(terms)))

        # Load cluster model from dump or process clustering.
        try:
            km = joblib.load('doc_cluster.pkl')
        except FileNotFoundError:
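            # No cached model: fit KMeans on the TF-IDF matrix. Note that
            # `matrix` only exists when the vectorizer was rebuilt above, so
            # the two pickles are assumed to be saved and removed together.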
            km = KMeans(n_clusters=num_clusters,
                        n_init=5,
                        max_iter=100,
                        precompute_distances=True,
                        verbose=1)
            km.fit(matrix)

            # Save clusters and vectorizer.
            joblib.dump(km, 'doc_cluster.pkl')
            joblib.dump(vectorizer, 'vectorizer.pkl')

        clusters = km.labels_.tolist()
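        # Term indices sorted by descending weight for each cluster centroid.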
        centroids = km.cluster_centers_.argsort()[:, ::-1]
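        # Map each document ID to its cluster label.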
        frame = pandas.DataFrame(documents,
                                 index=[clusters],
                                 columns=['doc_id'])

        # Print report of clusters.
        for i in range(num_clusters):
            print(colored("\n\n====================================",
                          'yellow'))
            print(colored("Cluster %d:" % (i), 'yellow'), end='')
            for word_idx in centroids[i, 0:9]:
                word = terms[word_idx]
                print(colored(' %s' % (word), 'yellow'), end=',')
            print(
                colored("\n====================================\n\n",
                        'yellow'))

            print("Documents:")
            for doc_id in frame.loc[i]['doc_id'].values.tolist():
                print(' - %s' % (self.document_field_value(doc_id, 'body')))
                print("------------------------------------")
Example #2
    def handle(self):
        """
        Index the corpus documents.

        """

        download('stopwords')
        indexdb = IndexDB()
        self.connection = indexdb.handler()
        data_dir = '/Users/pablocc/harvard_data/'
        counter = 0

        for filename in os.listdir(data_dir):
            if os.path.isdir(data_dir + filename) or filename[0] == '.':
                continue

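            # Each file is a binary MARC dump; index every record it contains.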
            with open(data_dir + filename, 'rb') as fh:
                reader = MARCReader(fh)
                for record in reader:
                    document = self.prepare_record(record)
                    counter += 1
                    print("%s - processing document %s." %
                          (counter, document['id']))
                    self.index_document(document)
Example #3
from indexdb import IndexDB
from math import log10
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sys import exit
from termcolor import colored
import numpy
import os
import pandas
import sqlite3

numpy.set_printoptions(threshold=numpy.nan)
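# Module-level connection to the index database, shared by the functions below.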
index_db = IndexDB()
connection = index_db.handler()


def indexed_document_words(doc_id):
    """ Get indexed document words.

    :param str doc_id: The document ID.
    :returns: A list of document words.

    """

    print("Tokens for document '%s'" % (doc_id))
    # Get document words
    db = connection.cursor()
    db.execute('''SELECT word FROM documents_words WHERE id = ?''', (doc_id, ))
    result = db.fetchall()