Example #1
 def __init__(self, th):
   BaseSearchEngine.__init__(self, th, False)
   self.__ix_writer = None
   ix_dir=os.path.join(th.prefixes[0],'index', "ix_"+str(whoosh_ix_ver))
   if not os.path.isdir(ix_dir): os.makedirs(ix_dir)
   # try to load a pre-existing index
   try: self.indexer=open_dir(ix_dir)
   except (EmptyIndexError, IndexVersionError):
     # create a new one
     try: shutil.rmtree(ix_dir, True); os.makedirs(ix_dir)
     except OSError: pass
     schema = Schema(
       kitab=ID(stored=True),
       vrr=ID(stored=True,unique=False), # version release
       nodeIdNum=ID(stored=True,unique=False), 
       title=TEXT(stored=True,field_boost=1.5, analyzer=analyzer),
       content=TEXT(stored=False,analyzer=analyzer),
       #content=TEXT(stored=False,analyzer=analyzer, vector=Frequency(analyzer=analyzer)), # with term vector
       tags=IDLIST(stored=False)
     )
     self.indexer=create_in(ix_dir,schema)
   #self.__ix_qparser = ThMultifieldParser(self.th, ("title","content",), schema=self.indexer.schema)
   self.__ix_qparser = MultifieldSQParser(("title","content",), self.indexer.schema)
   self.__ix_qparser.add_plugin(FieldAliasPlugin({
       u"kitab":(u"كتاب",),
       u"title":(u"عنوان",),
       u"tags":(u"وسوم",)})
   )
   #self.__ix_pre=whoosh.query.Prefix
   self.__ix_searcher= self.indexer.searcher()
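
For reference, here is a minimal standalone sketch of the open-or-create pattern all of these examples share, assuming only whoosh is installed; the field list mirrors the schema above, and StandardAnalyzer is a stand-in for the project's own Arabic analyzer:

import os
import shutil
from whoosh.index import open_dir, create_in, EmptyIndexError, IndexVersionError
from whoosh.fields import Schema, ID, TEXT, IDLIST
from whoosh.analysis import StandardAnalyzer

def open_or_create_index(ix_dir):
    # reuse a compatible on-disk index; rebuild it if it is empty or stale
    if not os.path.isdir(ix_dir):
        os.makedirs(ix_dir)
    try:
        return open_dir(ix_dir)
    except (EmptyIndexError, IndexVersionError):
        shutil.rmtree(ix_dir, True)
        os.makedirs(ix_dir)
        analyzer = StandardAnalyzer()  # stand-in analyzer
        schema = Schema(kitab=ID(stored=True),
                        vrr=ID(stored=True),
                        nodeIdNum=ID(stored=True),
                        title=TEXT(stored=True, field_boost=1.5, analyzer=analyzer),
                        content=TEXT(analyzer=analyzer),
                        tags=IDLIST())
        return create_in(ix_dir, schema)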
Example #2
 def __init__(self, th):
     BaseSearchEngine.__init__(self, th, False)
     self.__ix_writer = None
     ix_dir = os.path.join(th.prefixes[0], 'index',
                           "ix_" + str(whoosh_ix_ver))
     if not os.path.isdir(ix_dir):
         os.makedirs(ix_dir)
     # try to load a pre-existing index
     try:
         self.indexer = open_dir(ix_dir)
     except (EmptyIndexError, IndexVersionError):
         # create a new one
         try:
             shutil.rmtree(ix_dir, True)
             os.makedirs(ix_dir)
         except OSError:
             pass
         schema = Schema(
             kitab=ID(stored=True),
             vrr=ID(stored=True, unique=False),  # version release
             nodeIdNum=ID(stored=True, unique=False),
             title=TEXT(stored=True, field_boost=1.5, analyzer=analyzer),
             content=TEXT(stored=False, analyzer=analyzer),
             #content = TEXT(stored = False,analyzer = analyzer,
             #vector = Frequency(analyzer = analyzer)), # with term vector
             tags=IDLIST(stored=False))
         self.indexer = create_in(ix_dir, schema)
     #self.__ix_qparser = ThMultifieldParser(self.th, ("title","content",), schema=self.indexer.schema)
     self.__ix_qparser = MultifieldSQParser((
         "title",
         "content",
     ), self.indexer.schema)
     self.__ix_qparser.add_plugin(
         FieldAliasPlugin({
             u"kitab": (u"كتاب", ),
             u"title": (u"عنوان", ),
             u"tags": (u"وسوم", )
         }))
     #self.__ix_pre = whoosh.query.Prefix
     self.__ix_searcher = self.indexer.searcher()
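
The FieldAliasPlugin call above is what lets queries use Arabic field names. A small self-contained sketch of just that piece, assuming a stock whoosh MultifieldParser in place of the project's MultifieldSQParser and a hypothetical index directory:

from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser, FieldAliasPlugin

ix = open_dir("ix_dir")  # hypothetical path
parser = MultifieldParser(["title", "content"], ix.schema)
parser.add_plugin(FieldAliasPlugin({u"title": (u"عنوان",),
                                    u"tags": (u"وسوم",)}))
q = parser.parse(u"عنوان:نور")  # parsed as if it were title:نور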
Example #3
class SearchEngine(BaseSearchEngine):
    def __init__(self, th):
        BaseSearchEngine.__init__(self, th, False)
        self.__ix_writer = None
        ix_dir = os.path.join(th.prefixes[0], 'index',
                              "ix_" + str(whoosh_ix_ver))
        if not os.path.isdir(ix_dir):
            os.makedirs(ix_dir)
        # try to load a pre-existing index
        try:
            self.indexer = open_dir(ix_dir)
        except (EmptyIndexError, IndexVersionError):
            # create a new one
            try:
                shutil.rmtree(ix_dir, True)
                os.makedirs(ix_dir)
            except OSError:
                pass
            schema = Schema(
                kitab=ID(stored=True),
                vrr=ID(stored=True, unique=False),  # version release
                nodeIdNum=ID(stored=True, unique=False),
                title=TEXT(stored=True, field_boost=1.5, analyzer=analyzer),
                content=TEXT(stored=False, analyzer=analyzer),
                #content = TEXT(stored = False,analyzer = analyzer,
                #vector = Frequency(analyzer = analyzer)), # with term vector
                tags=IDLIST(stored=False))
            self.indexer = create_in(ix_dir, schema)
        #self.__ix_qparser = ThMultifieldParser(self.th, ("title","content",), schema=self.indexer.schema)
        self.__ix_qparser = MultifieldSQParser((
            "title",
            "content",
        ), self.indexer.schema)
        self.__ix_qparser.add_plugin(
            FieldAliasPlugin({
                u"kitab": (u"كتاب", ),
                u"title": (u"عنوان", ),
                u"tags": (u"وسوم", )
            }))
        #self.__ix_pre = whoosh.query.Prefix
        self.__ix_searcher = self.indexer.searcher()

    def __del__(self):
        if self.__ix_writer: self.__ix_writer.commit()

    def getIndexedVersion(self, name):
        """
        return a Version-Release string if in index, otherwise return None
        """
        try:
            d = self.__ix_searcher.document(kitab=unicode(makeId(name)))
        except TypeError:
            return None
        except KeyError:
            return None
        if d:
            return d['vrr']
        return None

    def queryIndex(self, queryString):
        """return an interatable of fields dict"""
        # FIXME: the return should not be implementation specific
        try:
            r = self.__ix_searcher.search(self.__ix_qparser.parse(queryString),
                                          limit=500)
        except QueryParserError:
            return None
        return r

    def resultExcerpt(self, results, i, ki=None):
        # FIXME: this should not be implementation specific
        if not ki:
            r = results[i]
            name = r['kitab']
            v = r['vrr'].split('-')[0]
            m = self.th.getMeta().getLatestKitabV(name, v)
            ki = self.th.getCachedKitab(m['uri'])
        num = int(results[i]['nodeIdNum'])
        node = ki.getNodeByIdNum(num)
        n = ki.toc.next(node)

        if n:
            ub = n.globalOrder
        else:
            ub = -1
        txt = node.toText(ub)

        # collect the (field, term) pairs that actually exist in the index
        s = results.q.existing_terms(self.indexer.reader(), phrases=True)
        # keep the unique terms found in the title/content fields
        terms = set(term for field, term in s
                    if field in ('content', 'title'))
        terms = [t.decode('utf_8') for t in terms]
        snippet_dummy = txt[:min(len(txt), 512)]  # dummy summary
        snippet = highlight(txt,
                            terms,
                            analyzer,
                            SentenceFragmenter(sentencechars=".!?؟\n"),
                            HtmlFormatter(between=u"\u2026\n"),
                            top=3,
                            scorer=BasicFragmentScorer,
                            minscore=1,
                            order=FIRST)
        #snippet = highlight(txt, terms, analyzer,
        #     SentenceFragmenter(sentencechars = ".!?"), ExcerptFormatter(between = u"\u2026\n"), top = 3,
        #     scorer = BasicFragmentScorer, minscore = 1,
        #     order = FIRST)
        print snippet
        if len(snippet) > 1: return snippet
        else: return snippet_dummy

    def indexingStart(self):
        """
        should be called before any sequence of indexing ops; reindexAll() calls this method automatically
        """
        if not self.__ix_writer:
            try:
                self.__ix_writer = self.indexer.writer()
            except OSError, e:
                print '*** whooshSearchEngine.indexingStart: %s' % e
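
A hedged usage sketch of the query path this class exposes; th and the query string are hypothetical, and queryIndex() returns None on a parse error, so the result is checked first:

engine = SearchEngine(th)  # th is a hypothetical Thawab-like manager
results = engine.queryIndex(u"title:light")
if results is not None:
    for i in range(min(len(results), 5)):
        print results[i]['title']
        print engine.resultExcerpt(results, i)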
Example #4
class SearchEngine(BaseSearchEngine):
    def __init__(self, th):
        BaseSearchEngine.__init__(self, th, False)
        self.__ix_writer = None
        ix_dir = os.path.join(th.prefixes[0], 'index',
                              "ix_" + str(whoosh_ix_ver))
        if not os.path.isdir(ix_dir): os.makedirs(ix_dir)
        # try to load a pre-existing index
        try:
            self.indexer = open_dir(ix_dir)
        except (EmptyIndexError, IndexVersionError):
            # create a new one
            try:
                shutil.rmtree(ix_dir, True)
                os.makedirs(ix_dir)
            except OSError:
                pass
            schema = Schema(
                kitab=ID(stored=True),
                vrr=ID(stored=True, unique=False),  # version release
                nodeIdNum=ID(stored=True, unique=False),
                title=TEXT(stored=True, field_boost=1.5, analyzer=analyzer),
                content=TEXT(stored=False, analyzer=analyzer),
                #content=TEXT(stored=False,analyzer=analyzer, vector=Frequency(analyzer=analyzer)), # with term vector
                tags=IDLIST(stored=False))
            self.indexer = create_in(ix_dir, schema)
        #self.__ix_qparser = ThMultifieldParser(self.th, ("title","content",), schema=self.indexer.schema)
        self.__ix_qparser = MultifieldSQParser((
            "title",
            "content",
        ), self.indexer.schema)
        self.__ix_qparser.add_plugin(
            FieldAliasPlugin({
                u"kitab": (u"كتاب", ),
                u"title": (u"عنوان", ),
                u"tags": (u"وسوم", )
            }))
        #self.__ix_pre=whoosh.query.Prefix
        self.__ix_searcher = self.indexer.searcher()

    def __del__(self):
        if self.__ix_writer: self.__ix_writer.commit()

    def getIndexedVersion(self, name):
        """
    return a Version-Release string if in index, otherwise return None
    """
        try:
            d = self.__ix_searcher.document(kitab=unicode(makeId(name)))
        except TypeError:
            return None
        except KeyError:
            return None
        if d: return d['vrr']
        return None

    def queryIndex(self, queryString):
        """return an interatable of fields dict"""
        # FIXME: the return should not be implementation specific
        try:
            r = self.__ix_searcher.search(self.__ix_qparser.parse(queryString),
                                          limit=500)
        except QueryParserError:
            return None
        return r

    def resultExcerpt(self, results, i, ki=None):
        # FIXME: this should not be implementation specific
        if not ki:
            r = results[i]
            name = r['kitab']
            v = r['vrr'].split('-')[0]
            m = self.th.getMeta().getLatestKitabV(name, v)
            ki = self.th.getCachedKitab(m['uri'])
        num = int(results[i]['nodeIdNum'])
        node = ki.getNodeByIdNum(num)
        n = ki.toc.next(node)
        if n: ub = n.globalOrder
        else: ub = -1
        txt = node.toText(ub)
        # fill s with the (field, term) pairs that actually exist in the index
        s = set()
        results.q.existing_terms(self.indexer.reader(), s, phrases=True)
        # keep the unique terms found in the title/content fields
        terms = list(set(term for field, term in s
                         if field in ('content', 'title')))
        #print "txt=[%s]" % len(txt)
        snippet = txt[:min(len(txt), 512)]  # dummy summary
        snippet = highlight(txt,
                            terms,
                            analyzer,
                            SentenceFragmenter(sentencechars=".!?؟\n"),
                            HtmlFormatter(between=u"\u2026\n"),
                            top=3,
                            scorer=BasicFragmentScorer,
                            minscore=1,
                            order=FIRST)
        #snippet=highlight(txt, terms, analyzer,
        #   SentenceFragmenter(sentencechars = ".!?"), ExcerptFormatter(between = u"\u2026\n"), top=3,
        #   scorer=BasicFragmentScorer, minscore=1,
        #   order=FIRST)
        return snippet

    def indexingStart(self):
        """
    should be called before any sequence of indexing Ops, reindexAll() calls this method automatically
    """
        if not self.__ix_writer: self.__ix_writer = self.indexer.writer()

    def indexingEnd(self):
        """
    should be called after a sequence of indexing Ops, reindexAll() calls this method automatically
    """
        self.__ix_writer.commit(optimize=True)
        # self.indexer.optimize() # no need for this with optimize in previous line
        self.reload()

    def reload(self):
        """
    called after commiting changes to index (eg. adding or dropping from index)
    """
        # no need to obtain a new one with self.indexer.searcher()
        self.__ix_searcher = self.__ix_searcher.refresh()
        self.__ix_writer = None

    def dropKitabIndex(self, name):
        """
    drop search index for a given Kitab by its uri
    if you call indexingStart() before this
    then you must call indexingEnd() after it
    """
        # FIXME: it seems that this used not work correctly without commit() just after drop, this mean that reindex needs a commit in-between
        ki = self.th.getKitab(name)
        if ki: self.th.getMeta().setIndexedFlags(ki.uri, 1)
        print "dropping index for kitab name:", name,
        w, c = self.__ix_writer, False
        if not w:
            # create a writer internally if one is not defined
            w, c = self.indexer.writer(), True
        # NOTE: because the searcher could be limited, loop and keep deleting until the query matches nothing
        while (w.delete_by_term('kitab', name)):
            print "*",
        print
        if c: w.commit()
        if ki: self.th.getMeta().setIndexedFlags(ki.uri, 0)

    def dropAll(self):
        # FIXME: it would be more efficient to delete the directory
        # NOTE: see http://groups.google.com/group/whoosh/browse_thread/thread/35b1700b4e4a3d5d
        self.th.getMeta().setAllIndexedFlags(1)
        self.indexingStart()
        reader = self.indexer.reader()  # also self.__ix_searcher.reader()
        # all_stored_fields() yields stored-field dicts, so iterate document numbers instead
        for docnum in reader.all_doc_ids():
            self.__ix_writer.delete_document(docnum)
        self.indexingEnd()
        self.th.getMeta().setAllIndexedFlags(0)

    def reindexKitab(self, name):
        """
    you need to call indexingStart() before this and indexingEnd() after it
    """
        # NOTE: this method is overridden here because we need to commit between dropping and creating a new index.
        # NOTE: can't use updateDocument because each Kitab contains many documents
        self.dropKitabIndex(name)
        self.__ix_writer.commit()
        self.indexKitab(name)

    def addDocumentToIndex(self, name, vrr, nodeIdNum, title, content, tags):
        """
    this method must be overridden in implementation specific way
    """
        if content:
            self.__ix_writer.add_document(kitab=name,
                                          vrr=vrr,
                                          nodeIdNum=unicode(nodeIdNum),
                                          title=title,
                                          content=content,
                                          tags=tags)

    def keyterms(self, kitab, vrr, nodeIdNum):
        s = self.indexer.searcher()
        dn = s.document_number(kitab=kitab,
                               vrr=vrr,
                               nodeIdNum=unicode(nodeIdNum))
        if dn is None: return None, []
        print " ## ", dn
        r = s.key_terms([dn], "content", numterms=5)
        return dn, r

    def related(self, kitab, vrr, nodeIdNum):
        dn, kt = self.keyterms(kitab, vrr, nodeIdNum)
        if not dn: return None
        for t, r in kt:
            print "term=", t, " @ rank=", r
        q = query.Or([query.Term("content", t) for (t, r) in kt])
        results = self.indexer.searcher().search(q, limit=10)
        for i, fields in enumerate(results):
            if results.docnum(i) != dn:
                print fields['kitab'], "\t\t", str(fields['nodeIdNum']), "\t\t", fields['title']
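
A sketch of how the keyterms()/related() pair defined above might be driven, assuming the index already holds the referenced node; the kitab name, version-release, and node number are hypothetical:

dn, kt = engine.keyterms(u"some-kitab", u"1.0-1", 42)
if dn is not None:
    for term, rank in kt:
        print "key term:", term, "rank:", rank
    engine.related(u"some-kitab", u"1.0-1", 42)  # prints similar nodes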
Example #5
class SearchEngine(BaseSearchEngine):
    def __init__(self, th):
        BaseSearchEngine.__init__(self, th, False)
        self.__ix_writer = None
        ix_dir = os.path.join(th.prefixes[0],'index', "ix_" + str(whoosh_ix_ver))
        if not os.path.isdir(ix_dir):
            os.makedirs(ix_dir)
        # try to load a pre-existing index
        try:
            self.indexer = open_dir(ix_dir)
        except (EmptyIndexError, IndexVersionError):
            # create a new one
            try:
                shutil.rmtree(ix_dir, True)
                os.makedirs(ix_dir)
            except OSError:
                pass
            schema = Schema(
                kitab = ID(stored = True),
                vrr = ID(stored = True, unique = False), # version release
                nodeIdNum = ID(stored = True, unique = False), 
                title = TEXT(stored = True, field_boost = 1.5, analyzer = analyzer),
                content = TEXT(stored = False,analyzer = analyzer),
                #content = TEXT(stored = False,analyzer = analyzer,
                #vector = Frequency(analyzer = analyzer)), # with term vector
                tags=IDLIST(stored = False)
            )
            self.indexer = create_in(ix_dir, schema)
        #self.__ix_qparser = ThMultifieldParser(self.th, ("title","content",), schema=self.indexer.schema)
        self.__ix_qparser = MultifieldSQParser(("title","content",), self.indexer.schema)
        self.__ix_qparser.add_plugin(FieldAliasPlugin({
                u"kitab":(u"كتاب",),
                u"title":(u"عنوان",),
                u"tags":(u"وسوم",)})
        )
        #self.__ix_pre = whoosh.query.Prefix
        self.__ix_searcher =  self.indexer.searcher()

    def __del__(self):
        if self.__ix_writer: self.__ix_writer.commit()

    def getIndexedVersion(self, name):
        """
        return a Version-Release string if in index, otherwise return None
        """
        try:
            d = self.__ix_searcher.document(kitab = unicode(makeId(name)))
        except TypeError:
            return None
        except KeyError:
            return None
        if d:
            return d['vrr']
        return None

    def queryIndex(self, queryString):
        """return an interatable of fields dict"""
        # FIXME: the return should not be implementation specific
        try:
            r = self.__ix_searcher.search(self.__ix_qparser.parse(queryString), limit = 500)
        except QueryParserError:
            return None
        return r

    def resultExcerpt(self, results, i, ki = None):
        # FIXME: this should not be implementation specific
        if not ki:
            r = results[i]
            name = r['kitab']
            v = r['vrr'].split('-')[0]
            m = self.th.getMeta().getLatestKitabV(name,v)
            ki = self.th.getCachedKitab(m['uri'])
        num = int(results[i]['nodeIdNum'])
        node = ki.getNodeByIdNum(num)
        n = ki.toc.next(node)
        if n:
            ub = n.globalOrder
        else:
            ub = -1
        txt = node.toText(ub)
        # fill s with the (field, term) pairs that actually exist in the index
        s = set()
        results.q.existing_terms(self.indexer.reader(), s, phrases = True)
        # keep the unique terms found in the title/content fields
        terms = list(set(term for field, term in s
                         if field in ('content', 'title')))
        #print "txt = [%s]" % len(txt)
        snippet = txt[:min(len(txt),512)] # dummy summary
        snippet = highlight(txt,
                            terms,
                            analyzer,
                            SentenceFragmenter(sentencechars = ".!?؟\n"),
                            HtmlFormatter(between = u"\u2026\n"),
                            top = 3,
                            scorer = BasicFragmentScorer,
                            minscore = 1,
                            order = FIRST)
        #snippet = highlight(txt, terms, analyzer,
        #     SentenceFragmenter(sentencechars = ".!?"), ExcerptFormatter(between = u"\u2026\n"), top = 3,
        #     scorer = BasicFragmentScorer, minscore = 1,
        #     order = FIRST)
        return snippet

    def indexingStart(self):
        """
        should be called before any sequence of indexing ops; reindexAll() calls this method automatically
        """
        if not self.__ix_writer:
            try:
                self.__ix_writer = self.indexer.writer()
            except OSError, e:
                print '*** whooshSearchEngine.indexingStart: %s' % e
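
indexingStart() above lazily opens a single writer; a minimal plain-whoosh sketch of that writer lifecycle (open once, add documents, commit once), using toy values and the index from the earlier open_or_create_index() sketch:

ix = open_or_create_index("ix_dir")  # hypothetical path
writer = ix.writer()
writer.add_document(kitab=u"some-kitab",
                    vrr=u"1.0-1",
                    nodeIdNum=u"1",
                    title=u"chapter one",
                    content=u"body text of the node",
                    tags=u"fiqh")
writer.commit(optimize=True)  # what indexingEnd() does in the fuller examples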
Example #6
class SearchEngine(BaseSearchEngine):
  def __init__(self, th):
    BaseSearchEngine.__init__(self, th, False)
    self.__ix_writer = None
    ix_dir=os.path.join(th.prefixes[0],'index', "ix_"+str(whoosh_ix_ver))
    if not os.path.isdir(ix_dir): os.makedirs(ix_dir)
    # try to load a pre-existing index
    try: self.indexer=open_dir(ix_dir)
    except (EmptyIndexError, IndexVersionError):
      # create a new one
      try: shutil.rmtree(ix_dir, True); os.makedirs(ix_dir)
      except OSError: pass
      schema = Schema(
        kitab=ID(stored=True),
        vrr=ID(stored=True,unique=False), # version release
        nodeIdNum=ID(stored=True,unique=False), 
        title=TEXT(stored=True,field_boost=1.5, analyzer=analyzer),
        content=TEXT(stored=False,analyzer=analyzer),
        #content=TEXT(stored=False,analyzer=analyzer, vector=Frequency(analyzer=analyzer)), # with term vector
        tags=IDLIST(stored=False)
      )
      self.indexer=create_in(ix_dir,schema)
    #self.__ix_qparser = ThMultifieldParser(self.th, ("title","content",), schema=self.indexer.schema)
    self.__ix_qparser = MultifieldSQParser(("title","content",), self.indexer.schema)
    self.__ix_qparser.add_plugin(FieldAliasPlugin({
        u"kitab":(u"كتاب",),
        u"title":(u"عنوان",),
        u"tags":(u"وسوم",)})
    )
    #self.__ix_pre=whoosh.query.Prefix
    self.__ix_searcher= self.indexer.searcher()

  def __del__(self):
    if self.__ix_writer: self.__ix_writer.commit()

  def getIndexedVersion(self, name):
    """
    return a Version-Release string if in index, otherwise return None
    """
    try: d=self.__ix_searcher.document(kitab=unicode(makeId(name)))
    except TypeError: return None
    except KeyError: return None
    if d: return d['vrr']
    return None

  def queryIndex(self, queryString):
    """return an interatable of fields dict"""
    # FIXME: the return should not be implementation specific
    try: r=self.__ix_searcher.search(self.__ix_qparser.parse(queryString), limit=500)
    except QueryParserError: return None
    return r

  def resultExcerpt(self, results, i, ki=None):
    # FIXME: this should not be implementation specific
    if not ki:
      r=results[i]
      name=r['kitab']
      v=r['vrr'].split('-')[0]
      m=self.th.getMeta().getLatestKitabV(name,v)
      ki=self.th.getCachedKitab(m['uri'])
    num=int(results[i]['nodeIdNum'])
    node=ki.getNodeByIdNum(num)
    n=ki.toc.next(node)
    if n: ub=n.globalOrder
    else: ub=-1
    txt=node.toText(ub)
    # fill s with the (field, term) pairs that actually exist in the index
    s=set()
    results.q.existing_terms(self.indexer.reader(), s, phrases=True)
    # keep the unique terms found in the title/content fields
    terms=list(set(term for field, term in s
                   if field in ('content', 'title')))
    #print "txt=[%s]" % len(txt)
    snippet=txt[:min(len(txt),512)] # dummy summary
    snippet=highlight(txt, terms, analyzer,
      SentenceFragmenter(sentencechars = ".!?؟\n"), HtmlFormatter(between=u"\u2026\n"),
      top=3, scorer=BasicFragmentScorer, minscore=1, order=FIRST)
    #snippet=highlight(txt, terms, analyzer,
    #   SentenceFragmenter(sentencechars = ".!?"), ExcerptFormatter(between = u"\u2026\n"), top=3,
    #   scorer=BasicFragmentScorer, minscore=1,
    #   order=FIRST)
    return snippet

  def indexingStart(self):
    """
    should be called before any sequence of indexing ops; reindexAll() calls this method automatically
    """
    if not self.__ix_writer: self.__ix_writer=self.indexer.writer()

  def indexingEnd(self):
    """
    should be called after a sequence of indexing ops; reindexAll() calls this method automatically
    """
    self.__ix_writer.commit(optimize=True)
    # self.indexer.optimize() # no need for this with optimize in previous line
    self.reload()

  def reload(self):
    """
    called after committing changes to the index (e.g. adding or dropping documents)
    """
    self.__ix_searcher = self.__ix_searcher.refresh() # no need to obtain new one with self.indexer.searcher()
    self.__ix_writer = None

  def dropKitabIndex(self, name):
    """
    drop the search index for a given Kitab by its uri;
    if you call indexingStart() before this,
    then you must call indexingEnd() after it
    """
    # FIXME: it seems that this used to misbehave without a commit() just after the drop; this means reindex needs a commit in-between
    ki=self.th.getKitab(name)
    if ki: self.th.getMeta().setIndexedFlags(ki.uri, 1)
    print "dropping index for kitab name:", name,
    w, c = self.__ix_writer, False
    if not w: w, c=self.indexer.writer(), True # creates a writer internally if one is not defined
    # NOTE: because the searcher could be limited, loop and keep deleting until the query matches nothing
    while(w.delete_by_term('kitab', name)):
      print "*",
    print
    if c: w.commit()
    if ki: self.th.getMeta().setIndexedFlags(ki.uri, 0)

  def dropAll(self):
    # FIXME: it would be more efficient to delete the directory
    # NOTE: see http://groups.google.com/group/whoosh/browse_thread/thread/35b1700b4e4a3d5d
    self.th.getMeta().setAllIndexedFlags(1)
    self.indexingStart()
    reader = self.indexer.reader() # also self.__ix_searcher.reader()
    # all_stored_fields() yields stored-field dicts, so iterate document numbers instead
    for docnum in reader.all_doc_ids():
      self.__ix_writer.delete_document(docnum)
    self.indexingEnd()
    self.th.getMeta().setAllIndexedFlags(0)

  def reindexKitab(self,name):
    """
    you need to call indexingStart() before this and indexingEnd() after it
    """
    # NOTE: this method is overridden here because we need to commit between dropping and creating a new index.
    # NOTE: can't use updateDocument because each Kitab contains many documents
    self.dropKitabIndex(name); self.__ix_writer.commit(); self.indexKitab(name)

  def addDocumentToIndex(self, name, vrr, nodeIdNum, title, content, tags):
    """
    this method must be overridden in an implementation-specific way
    """
    if content: self.__ix_writer.add_document(kitab=name, vrr=vrr, nodeIdNum=unicode(nodeIdNum), title=title, content=content, tags=tags)

  def keyterms(self, kitab, vrr, nodeIdNum):
    s = self.indexer.searcher()
    dn = s.document_number(kitab=kitab, vrr=vrr, nodeIdNum=unicode(nodeIdNum))
    if dn is None: return None, []
    print " ## ", dn
    r=s.key_terms([dn], "content", numterms=5)
    return dn,r

  def related(self, kitab, vrr, nodeIdNum):
    dn,kt=self.keyterms(kitab, vrr, nodeIdNum)
    if not dn: return None
    for t,r in kt:
      print "term=", t, " @ rank=",r
    q = query.Or([query.Term("content", t) for (t,r) in kt])
    results = self.indexer.searcher().search(q, limit=10)
    for i, fields in enumerate(results):
      if results.docnum(i) != dn:
        print fields['kitab'],"\t\t",str(fields['nodeIdNum']),"\t\t",fields['title']
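
Putting Example #6 together, a hedged end-to-end sketch: index a kitab, query it, then drop it again. th and the kitab name are hypothetical, and indexKitab() is assumed to be inherited from BaseSearchEngine:

engine = SearchEngine(th)
if engine.getIndexedVersion(u"some-kitab") is None:
    engine.indexingStart()
    engine.indexKitab(u"some-kitab")  # assumed inherited from BaseSearchEngine
    engine.indexingEnd()              # commit(optimize=True) then reload()
hits = engine.queryIndex(u"كتاب:some-kitab")  # the alias كتاب resolves to kitab
engine.indexingStart()
engine.dropKitabIndex(u"some-kitab")
engine.indexingEnd()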