Exemplo n.º 1
0
    def build_index(self, remove_old=True):
        """Build the keyword search index stored in ``self.search_db_dir``.

        When ``remove_old`` is true the existing index directory is removed
        first.  One document is added per keyword entry found in
        ``self.__keywords``: element 0 of the entry is stored as
        ``module_uid`` and element 1 is split into words and indexed as
        free text under ``keyword_term``.
        """
        if remove_old:
            remove_directory(self.search_db_dir)

        indexer = self.__xappy = xappy.IndexerConnection(self.search_db_dir)

        # Schema: the uid is only stored, the terms are only indexed.
        indexer.add_field_action(
            "module_uid", xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action(
            "keyword_term", xappy.FieldActions.INDEX_FREETEXT, nopos=True)

        for module_entry in self.__keywords:
            # Element 2 of each module entry holds its keyword entries.
            for entry in module_entry[2]:
                document = xappy.UnprocessedDocument()
                document.fields.append(xappy.Field("module_uid", entry[0]))

                joined_terms = ' '.join(list(split_word(entry[1], True)))
                document.fields.append(
                    xappy.Field("keyword_term", joined_terms))

                indexer.add(document)

        indexer.close()
Exemplo n.º 2
0
def indexer_connection(index_path=None):
    """Open an xappy indexer connection and register its field actions.

    ``index_path`` is the index location; when it is falsy the path
    returned by ``configure()`` is used instead.  Returns the open
    connection; the caller is responsible for flushing/closing it.
    """
    indexer = xappy.IndexerConnection(index_path or configure())
    actions = xappy.FieldActions

    # (field, action, extra keyword arguments), registered in this order.
    schema = (
        # indexes
        ('searchable_text', actions.INDEX_FREETEXT, {'nopos': True}),
        ('author', actions.INDEX_EXACT, {}),
        # ('keywords', actions.FACET) is disabled
        ('type', actions.INDEX_EXACT, {}),
        ('alpha', actions.INDEX_EXACT, {}),
        ('language', actions.INDEX_EXACT, {}),
        ('genre', actions.INDEX_EXACT, {}),
        ('sortable_title', actions.SORTABLE, {}),
        ('hidden', actions.INDEX_EXACT, {}),
        # ('modified', actions.SORTABLE, type='data') is disabled

        # metadata
        ('title', actions.STORE_CONTENT, {}),
        ('alpha', actions.STORE_CONTENT, {}),
        ('language', actions.STORE_CONTENT, {}),
        ('genre', actions.STORE_CONTENT, {}),
        ('type', actions.STORE_CONTENT, {}),
        ('searchable_text', actions.STORE_CONTENT, {}),
        # 'description' and 'author' STORE_CONTENT are disabled
    )
    for field_name, action, extra in schema:
        indexer.add_field_action(field_name, action, **extra)

    return indexer
Exemplo n.º 3
0
    def maybe_make_db(self):
        """Create and configure the database directory if it is missing.

        Nothing happens when the path returned by ``self.dbpath()`` already
        exists.  Otherwise the directory is created, an indexer connection
        is opened on it, the internal field actions are registered, and the
        ``title``, ``content``, ``description`` and ``keyword`` fields are
        set up to be both free-text indexed and stored.  The connection is
        closed before returning.
        """
        dbpath = self.dbpath()
        if os.path.exists(dbpath):
            return

        os.makedirs(dbpath)
        conn = xappy.IndexerConnection(dbpath)
        add_internal_field_actions(conn, self.stopwords, self.language)

        # Options shared by every free-text field.
        free_text_options = dict(
            stop=self.stopwords, spell=True, language=self.language)

        for field_name in ('title', 'content', 'description', 'keyword'):
            conn.add_field_action(
                field_name, xappy.FieldActions.INDEX_FREETEXT,
                **free_text_options)
            conn.add_field_action(
                field_name, xappy.FieldActions.STORE_CONTENT)

        conn.close()
Exemplo n.º 4
0
def index():
    """Index entire database.

    Opens the indexer at ``config.search_db``, registers the ``name``,
    ``id`` and ``type`` fields, then indexes every row of the party,
    state, constituency and candidate tables before flushing and closing.
    """
    indexer = xappy.IndexerConnection(config.search_db)
    indexer.add_field_action(
        "name", xappy.FieldActions.INDEX_FREETEXT, spell=True)
    for exact_field in ("id", "type"):
        indexer.add_field_action(exact_field, xappy.FieldActions.INDEX_EXACT)

    def index_record(record):
        # One document per record: every key/value pair becomes a field and
        # the record itself is attached as the document data.
        unprocessed = xappy.UnprocessedDocument()
        unprocessed.id = record.id
        for key, value in record.items():
            unprocessed.fields.append(xappy.Field(key, value))
        processed = indexer.process(unprocessed)
        processed.data = record
        indexer.replace(processed)

    import db

    def index_table(table, id_prefix, kind):
        for row in db.getdb().select(table):
            row.id = id_prefix + row.id
            row.type = kind
            if table == "constituency":
                # Constituency ids are additionally prefixed by their state.
                row.id = row.state + "/" + row.id
            index_record(row)

    # (table, id prefix, type)
    tables = (
        ("party", "party/", "party"),
        ("state", "", "state"),
        ("constituency", "", "constituency"),
        ("candidate", "candidate/", "candidate"),
    )
    for table, id_prefix, kind in tables:
        index_table(table, id_prefix, kind)

    indexer.flush()
    indexer.close()
Exemplo n.º 5
0
def CreateIndex():
    """Set up the field actions of the xappy index at 'kis/lib/data'.

    ``kod`` is indexed for exact matching and ``name`` as free text with
    ``language='ru'``; the connection is closed afterwards.
    """
    index_conn = xappy.IndexerConnection('kis/lib/data')
    actions = xappy.FieldActions

    index_conn.add_field_action('kod', actions.INDEX_EXACT)
    index_conn.add_field_action('name', actions.INDEX_FREETEXT, language='ru')

    index_conn.close()
Exemplo n.º 6
0
def initdb():
    """Open the index at ``DBPATH``, pass it to ``trash`` and define fields.

    ``name`` is free-text indexed with ``spell=True``; ``id`` is free-text
    indexed and stored.  Returns the open indexer connection.
    """
    connection = xappy.IndexerConnection(DBPATH)
    trash(connection)

    actions = xappy.FieldActions
    field_setup = (
        ('name', actions.INDEX_FREETEXT, {'spell': True}),
        ('id', actions.INDEX_FREETEXT, {}),
        ('id', actions.STORE_CONTENT, {}),
    )
    for field_name, action, extra in field_setup:
        connection.add_field_action(field_name, action, **extra)

    return connection
Exemplo n.º 7
0
def MakeIndex():
    """Fill the xappy index at 'kis/lib/data' from the store list table.

    Every ``(rec_id, name)`` row of ``t_show_store_eisup_list`` becomes one
    document whose ``kod`` and ``name`` fields hold the UTF-8 encoded
    values.  The index is flushed and closed at the end.
    """
    connection = xappy.IndexerConnection('kis/lib/data')

    # NOTE(review): `connections` is presumably django.db.connections --
    # confirm against the module imports.
    cursor = connections['default'].cursor()
    cursor.execute("SELECT rec_id,name FROM t_show_store_eisup_list;")
    data = cursor.fetchall()

    # The loop body used to be tab-indented inside a space-indented
    # function, which is a TabError on Python 3 and under `python -tt`.
    for item in data:
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('kod', item[0].encode('utf-8')))
        doc.fields.append(xappy.Field('name', item[1].encode('utf-8')))
        connection.add(doc)

    connection.flush()
    connection.close()
Exemplo n.º 8
0
 def xappy_indexer_connection(self, path):
     """Open an indexer connection at ``path`` with the message schema.

     ``body`` and ``subject`` are free-text indexed (``language='en'``);
     ``date``, ``frm`` and ``to`` are exact-indexed and stored; ``cc`` and
     ``bcc`` are exact-indexed only.  ``body`` is stored only when
     ``self.options.storebody`` is true.  Returns the open connection.
     """
     conn = xappy.IndexerConnection(path)
     actions = xappy.FieldActions

     conn.add_field_action('body', actions.INDEX_FREETEXT, language='en')
     if self.options.storebody:
         conn.add_field_action('body', actions.STORE_CONTENT)

     for header in ('date', 'frm', 'to'):
         conn.add_field_action(header, actions.INDEX_EXACT)
         conn.add_field_action(header, actions.STORE_CONTENT)

     conn.add_field_action('subject', actions.INDEX_FREETEXT, language='en')
     conn.add_field_action('subject', actions.STORE_CONTENT)

     for header in ('cc', 'bcc'):
         conn.add_field_action(header, actions.INDEX_EXACT)

     return conn
Exemplo n.º 9
0
    def __init__(self, base_uri, db_path):
        """Create a database writer for the specified path.

        Sets up a queue bounded at 1000 entries and opens an indexer
        connection on ``self.db_path``.  If the queue object lacks
        ``task_done`` or ``join``, a do-nothing stand-in is installed so
        callers can invoke them unconditionally.
        """
        BaseDbWriter.__init__(self, base_uri, db_path)
        self.queue = Queue.Queue(1000)

        def nop(*args):
            pass

        for method_name in ('task_done', 'join'):
            if not hasattr(self.queue, method_name):
                setattr(self.queue, method_name, nop)

        self.iconn = xappy.IndexerConnection(self.db_path)
    def buildIndexWithArticles(self, articles):
        """Index ``articles`` into the Xapian database at ``self.__xapianPath``.

        Each article is converted with ``self.__buildDoc``; articles for
        which that returns ``None`` are skipped, as are documents that fail
        to be added.  After every successfully added document the class-wide
        counter is incremented and a progress message is rewritten in place
        on stdout.  The index is flushed and closed at the end.
        """
        conn = xappy.IndexerConnection(self.__xapianPath)

        #add priority to title field in case of ranked matching (weight=5)- index all fields and store data
        conn.add_field_action('title',
                              xappy.FieldActions.INDEX_FREETEXT,
                              weight=5,
                              language='en')
        conn.add_field_action('text',
                              xappy.FieldActions.INDEX_FREETEXT,
                              language='en')
        conn.add_field_action('chemical_exact', xappy.FieldActions.INDEX_EXACT)
        conn.add_field_action('keyword',
                              xappy.FieldActions.INDEX_FREETEXT,
                              language='en')
        conn.add_field_action('mesh',
                              xappy.FieldActions.INDEX_FREETEXT,
                              language='en')

        conn.add_field_action('text', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('title', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('chemical_exact',
                              xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('keyword', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('mesh', xappy.FieldActions.STORE_CONTENT)

        for article in articles:
            doc = self.__buildDoc(article)
            if doc is None:
                continue
            try:
                #process doc to pdoc explicitly - not needed here
                #pdoc = conn.process(doc)
                conn.add(doc)
            except Exception:
                # Best effort: a document that cannot be indexed is skipped.
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit still stop a long indexing run.
                continue

            PubMedXapian.__indexCount += 1
            # Emit enough backspaces to overwrite the previous message.
            nbs = len(PubMedXapian.__indexMsg)
            PubMedXapian.__indexMsg = "article %s indexed" % (str(
                PubMedXapian.__indexCount))
            sys.stdout.write('\b' * nbs + PubMedXapian.__indexMsg)
        conn.flush()
        conn.close()
Exemplo n.º 11
0
    def create_index(self):
        """Create a new index at ``self.dbpath`` and define its fields.

        Every field is free-text indexed; the per-field options differ only
        in whether ``language='en'`` and/or ``spell=True`` are passed.  The
        open connection is kept on ``self.indexer``.
        """
        indexer = xappy.IndexerConnection(self.dbpath)

        english = {'language': 'en'}
        english_spell = {'language': 'en', 'spell': True}
        spell_only = {'spell': True}

        # (field name, keyword options), registered in this order.
        free_text_fields = (
            ('exact_name', {}),
            ('name', english_spell),
            ('summary', english),
            ('description', english),
            ('subpackages', english_spell),
            ('category_tags', english_spell),
            ('cmd', spell_only),
            # FieldActions.TAG not currently supported in F15 xapian (1.2.7),
            # so a plain free-text 'tag' field is used instead of 'tags'.
            ('tag', spell_only),
            # 'requires' and 'provides' (INDEX_EXACT) are disabled.
        )
        for field_name, options in free_text_fields:
            indexer.add_field_action(
                field_name, xappy.FieldActions.INDEX_FREETEXT, **options)

        self.indexer = indexer
Exemplo n.º 12
0
import sys
import re
import redis
import xappy
import time
try:
    import json
except ImportError:
    import simplejson as json
from django.utils.html import strip_tags

from backend.parser import TranscriptParser, MetaParser
from backend.api import Act, KeyScene, Character, Glossary, LogLine
from backend.util import seconds_to_timestamp

# Module-level xappy indexer connection, opened at import time on the
# 'xappydb' directory one level above this module's directory.
# NOTE(review): `os` is not among the imports visible in this excerpt --
# confirm it is imported earlier in the file.
search_db = xappy.IndexerConnection(
    os.path.join(os.path.dirname(__file__), '..', 'xappydb'), )


def mission_time_to_timestamp(mission_time):
    """Convert a mission time string (``[-]DD:HH:MM:SS``) to signed seconds.

    A leading ``-`` marks a time before the mission start and negates the
    whole duration, e.g. ``"-00:00:30:00"`` -> ``-1800`` and
    ``"-01:02:03:04"`` -> ``-93784``.

    Raises ``ValueError`` if the string does not consist of four
    colon-separated integers.
    """
    days, hours, minutes, seconds = map(int, mission_time.split(':'))

    # int() attaches the sign to the day field only, so take its absolute
    # value and apply the sign to the total.  Otherwise a non-zero negative
    # day would be summed against positive h/m/s and then flipped again.
    magnitude = abs(days) * 86400 + hours * 3600 + minutes * 60 + seconds

    if mission_time[0] == "-":
        return -magnitude
    return magnitude


class TranscriptIndexer(object):
    """
Exemplo n.º 13
0
    store_dir = path.join(store_dir, 'parts', part_target)
    if path.exists(store_dir):
        assert path.isdir(store_dir)
    else:
        os.mkdir(store_dir)
    
    return store_dir

# Make sure the storage directory exists and remember where it is.
store_dir = setupStorageDirectory()

# Search side: connection hub over the store.
searcher = search.IndexSearch(store_dir)

# Indexing side: connection used by the async indexer.
indexer = xappy.IndexerConnection(store_dir)

# For synchronous debugging, hand the index connection to the index module.
if iindex.DEBUG_SYNC:
    iindex.DEBUG_SYNC_IDX = indexer

# Debug mode uses shorter polling and refresh intervals.
queue.QueueProcessor.POLL_TIMEOUT = 3 if interfaces.DEBUG else 5
searcher.hub.auto_refresh_delta = 5 if interfaces.DEBUG else 10
Exemplo n.º 14
0
    def do_indexing(self, col, filter_settings):
        """Perform an indexing pass.

        Index the database for col using filters given by
        filter_settings.  The filename is used as the document id, and
        this is used to remove documents in the database that no
        longer have an associated file.

        continue_check will be called before each file of the
        collection is about to be processed. If it returns False then
        indexing will stop and do_indexing will return False.  If
        do_indexing attempts to index all the files then it will
        return True.

        """
        conn = None
        try:
            name = col.name
            # NOTE(review): root_logger is not used in the visible part of
            # this method.
            root_logger = logging.getLogger()
            get_remote_log().info(
                "Indexing collection: %s with filter settings: %s" %
                (name, filter_settings))
            dbname = col.dbpath()

            # This will error if the directory containing the databases has
            # disappeared, but that's probably a good thing - the document
            # collection is supposed to know where its database is - if it's
            # asking for indexing of a non-existent database, then it's the
            # collection's problem not the indexer's.
            # FIXME - we should really test for the error though, so we can
            # give a better error message.
            conn = xappy.IndexerConnection(dbname)
            conn.set_max_mem_use(max_mem_proportion=0.1)

            # Start with every id currently in the index marked "not found";
            # files seen in the walk below are flipped to True, and whatever
            # is still False afterwards is deleted from the index.
            docs_found = dict((id, False) for id in conn.iterids())

            error_count = file_count = 0
            for f in col.files():
                if self.disk_space_short(dbname):
                    # raise an exception rather than return False - we
                    # don't want to keep trying to index in this
                    # situation.
                    raise DiskSpaceShortage
                # A falsy result from _process_file is counted as an error,
                # but the file is still counted and marked as found.
                if not self._process_file(f, conn, name, filter_settings):
                    error_count += 1
                file_count += 1
                docs_found[f] = True
                if not self.continue_check(file_count, error_count):
                    get_remote_log().debug(
                        "Prematurely terminating indexing, stop flag is true")
                    # NOTE(review): conn is not closed on this path in the
                    # visible code - confirm cleanup happens elsewhere.
                    return False

            # Remove index entries whose file was not seen in this pass.
            for id, found in docs_found.iteritems():
                if not found:
                    get_remote_log().debug("Removing %s from %s" % (id, name))
                    conn.delete(id)

            get_remote_log().info("Indexing of %s finished" % name)
            conn.close()
            get_remote_log().debug("Changes to %s flushed" % name)
            return True

        except xappy.XapianDatabaseLockError, e:
            # A locked database is logged and otherwise ignored here.
            get_remote_log().error(
                "Attempt to index locked database: %s, ignoring" % dbname)
Exemplo n.º 15
0
 os.chdir(directory)
 recent_filePath = os.getcwd()
 # get all subdirectories
 nested_directories = os.listdir(recent_filePath)
 # check whether there are subdirectories
 if os.path.isdir(nested_directories[0]):
     print directory, nested_directories
     # change into subdirectory
     recent_filePath = os.path.join(recent_filePath, nested_directories[0])
     os.chdir(recent_filePath)
 # get all documents in the currently selected journal directory
 files = os.listdir(recent_filePath)
 # store current indexing ID (used in ids.txt)
 recent_xapianPath = str(counter)
 # open a new file connection to create a Xapian index
 conn = xappy.IndexerConnection(os.path.join(xapianPath, recent_xapianPath))
 # create field to store the full texts
 conn.add_field_action('text',
                       xappy.FieldActions.INDEX_FREETEXT,
                       language='en')
 if not use_psql:
     # create a data field to store the full text in it, e.g. while iterating over search results
     conn.add_field_action('text', xappy.FieldActions.STORE_CONTENT)
 # iterate over all journal directories
 for file_name in files:
     doc = xappy.UnprocessedDocument()
     f = open(os.path.join(recent_filePath, file_name), "r")
     text = f.read()
     f.close()
     doc.fields.append(xappy.Field("text", text))
     try:
Exemplo n.º 16
0
def main():
    tornado.options.parse_command_line()

    from apps.main.models import User
    from apps.questions.models import Question, Genre
    from mongokit import Connection
    con = Connection()
    con.register([Question, Genre, User])
    db = con.gkc

    if options.reindex_all:
        since = datetime.datetime(1979, 12, 13)
    else:
        since = options.since
        if not since:
            since = default_since
        try:
            since = datetime.datetime.strptime(since, '%Y-%m-%d %H-%M-%S')
        except ValueError:
            since = datetime.datetime.strptime(since, '%Y-%m-%d')
    if options.verbose:
        print 'since', since

    genres = {}
    authors = {}
    count = 0
    search = {'modify_date': {'$gt': since}}
    if not db.Question.find(search).count():
        if options.verbose:
            print "0 questions"
        if not options.test:
            return
    youngest = since

    indexer = xappy.IndexerConnection(settings.XAPIAN_LOCATION)
    if not indexer.get_fields_with_actions() or options.update_fields:
        indexer.add_field_action('question',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 weight=2,
                                 language='en',
                                 spell=True,
                                 stop=stopwords)
        indexer.add_field_action(
            'answer',
            xappy.FieldActions.INDEX_FREETEXT,
            language='en',
            spell=True,
        )
        indexer.add_field_action('accept',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=True)
        indexer.add_field_action('alternatives',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=True)
        indexer.add_field_action('author', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('genre', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('comment',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=False,
                                 search_by_default=False,
                                 stop=stopwords)
        indexer.add_field_action('date',
                                 xappy.FieldActions.SORTABLE,
                                 type="date")
        indexer.add_field_action('state', xappy.FieldActions.SORTABLE)

        indexer.add_field_action('question', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('answer', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('genre', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('state', xappy.FieldActions.STORE_CONTENT)

    t0 = time.time()
    for question in db.Question.collection.find(search):
        if question['modify_date'] > youngest:
            youngest = question['modify_date']
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('state', question['state']))
        doc.fields.append(xappy.Field('question', question['text']))
        doc.fields.append(xappy.Field('answer', question['answer']))
        if question['genre'].id in genres:
            genre = genres[question['genre'].id]
        else:
            genre = db.Genre.one({'_id': question['genre'].id})
            genre = genre.name
            genres[question['genre'].id] = genre
        doc.fields.append(xappy.Field('genre', genre))
        if question['author'].id in authors:
            author = authors[question['author'].id]
        else:

            author = db.User.one({'_id': question['author'].id})
            author = author.username
            authors[question['author'].id] = author
        doc.fields.append(xappy.Field('author', author))
        doc.fields.append(xappy.Field('comment', question['comment']))
        doc.fields.append(xappy.Field('accept', '\n'.join(question['accept'])))
        doc.fields.append(
            xappy.Field('alternatives', '\n'.join(question['alternatives'])))
        doc.id = str(question['_id'])
        pdoc = indexer.process(doc)
        indexer.replace(pdoc)
        count += 1
        #if count and not count % 100:
        #    indexer.flush()
    # add a second to avoid milliseconds causing the same doc to be index over and over
    youngest += datetime.timedelta(seconds=1)
    open(since_filename, 'w').write(youngest.strftime('%Y-%m-%d %H-%M-%S\n'))

    indexer.flush()
    t1 = time.time()
    indexer.close()
    if options.verbose:
        print round(t1 - t0, 3), "seconds to index", count, "questions"

    # test
    if options.test:
        print settings.XAPIAN_LOCATION
        searcher = xappy.SearchConnection(settings.XAPIAN_LOCATION)
        text = 'FRAMCEs capitalls'
        text = "Capitol STATE"
        print searcher.spell_correct(text)
        query = searcher.query_field('question',
                                     text,
                                     default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
            #result.data['state']

        text = 'london'
        query = searcher.query_field('answer', text, default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
Exemplo n.º 17
0
    def create_db(self, db_path):
        """Create a xappy database at db_path.

        An indexer connection is opened on ``db_path`` and closed again
        straight away, without adding any fields or documents.
        """
        xappy.IndexerConnection(db_path).close()
    def create_index(self):
        # Opens an indexer connection and a search connection on the same
        # database path and keeps both on the instance.
        # NOTE(review): the excerpt may show only the start of this method.
        self.iconn = xappy.IndexerConnection(self.dbpath)
        self.sconn = xappy.SearchConnection(self.dbpath)

        # keys are filtered package names or "_last_run_"
        self.iconn.add_field_action('key', xappy.FieldActions.INDEX_EXACT)
Exemplo n.º 19
0
 def __enter__(self):
     """Open the indexer connection for the ``with`` block and return it.

     The connection is kept on ``self.conn`` and its memory use is capped
     with ``MAX_MEM`` before it is handed to the caller.
     """
     self.conn = connection = xappy.IndexerConnection(self.dbpath)
     connection.set_max_mem_use(MAX_MEM)
     return connection