Example #1
    def handle(self):
        """
        Cluster the corpus documents.

        """

        index_db = IndexDB()
        self.connection = index_db.handler()
        documents = self.indexed_documents()
        total_docs = len(documents)
        # One cluster for roughly every 500 documents (at least one).
        num_clusters = max(1, round(total_docs / 500))

        # Load the vectorizer from its dump, or vectorize the documents.
        try:
            vectorizer = joblib.load('vectorizer.pkl')
        except FileNotFoundError:
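            # No cached vectorizer: build the TF-IDF matrix and vectorizer
            # from the indexed documents.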
            matrix, vectorizer = self.documents_vectors()

        terms = vectorizer.get_feature_names()
        print("\nUsing %d features for clustering.\n" % (len(terms)))

        # Load cluster model from dump or process clustering.
        try:
            km = joblib.load('doc_cluster.pkl')
        except FileNotFoundError:
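            # No cached model: fit KMeans on the TF-IDF matrix. Note that
            # `matrix` only exists when the vectorizer was rebuilt above, so
            # the two pickles are assumed to be saved and removed together.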
            km = KMeans(n_clusters=num_clusters,
                        n_init=5,
                        max_iter=100,
                        precompute_distances=True,
                        verbose=1)
            km.fit(matrix)

            # Save clusters and vectorizer.
            joblib.dump(km, 'doc_cluster.pkl')
            joblib.dump(vectorizer, 'vectorizer.pkl')

        clusters = km.labels_.tolist()
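        # Term indices sorted by descending weight for each cluster centroid.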
        centroids = km.cluster_centers_.argsort()[:, ::-1]
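        # Map each document ID to its cluster label.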
        frame = pandas.DataFrame(documents,
                                 index=[clusters],
                                 columns=['doc_id'])

        # Print report of clusters.
        for i in range(num_clusters):
            print(colored("\n\n====================================",
                          'yellow'))
            print(colored("Cluster %d:" % (i), 'yellow'), end='')
            for word_idx in centroids[i, 0:9]:
                word = terms[word_idx]
                print(colored(' %s' % (word), 'yellow'), end=',')
            print(
                colored("\n====================================\n\n",
                        'yellow'))

            print("Documents:")
            for doc_id in frame.loc[i]['doc_id'].values.tolist():
                print(' - %s' % (self.document_field_value(doc_id, 'body')))
                print("------------------------------------")
Example #2
    def handle(self):
        """
        Index the corpus documents.

        """

        download('stopwords')
        indexdb = IndexDB()
        self.connection = indexdb.handler()
        data_dir = '/Users/pablocc/harvard_data/'
        counter = 0

        for filename in os.listdir(data_dir):
            if os.path.isdir(data_dir + filename) or filename[0] == '.':
                continue

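            # Each file is a binary MARC dump; index every record it contains.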
            with open(data_dir + filename, 'rb') as fh:
                reader = MARCReader(fh)
                for record in reader:
                    document = self.prepare_record(record)
                    counter += 1
                    print("%s - processing document %s." %
                          (counter, document['id']))
                    self.index_document(document)
Example #3
from indexdb import IndexDB
from math import log10
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sys import exit
from termcolor import colored
import numpy
import os
import pandas
import sqlite3

numpy.set_printoptions(threshold=numpy.nan)
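# Module-level connection to the index database, shared by the functions below.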
index_db = IndexDB()
connection = index_db.handler()


def indexed_document_words(doc_id):
    """ Get indexed document words.

    :param str doc_id: The document ID.
    :returns: A list of document words.

    """

    print("Tokens for document '%s'" % (doc_id))
    # Get document words
    db = connection.cursor()
    db.execute('''SELECT word FROM documents_words WHERE id = ?''', (doc_id, ))
    result = db.fetchall()