Пример #1
0
 def flush(self, notice=None, force=False):
   """Index the documents that are still pending, then merge the index.

   Every location from the one after self.index_lastloc() up to and
   including self._last_unindexed_loc is passed to a fresh Indexer;
   afterwards the pending marker is cleared (set back to None).

   notice: optional callable; called once, before indexing starts, with
     the number of documents about to be indexed (lastloc - prevloc).
   force: when true, the last location of the collection (len(self)-1)
     is treated as pending even if nothing was marked, and the flag is
     forwarded to self.merge().

   Returns None.
   """
   from fooling.indexer import Indexer
   if force:
     self._last_unindexed_loc = len(self)-1
   # None is the "nothing pending" marker (see the reset below).  Test for
   # it explicitly: location 0 is a valid value (e.g. len(self)-1 for a
   # one-document collection) and a plain truthiness test would silently
   # skip the flush in that case.
   if self._last_unindexed_loc is not None:
     indexer = Indexer(self, verbose=self.verbose)
     # A falsy index_lastloc() is treated as "nothing indexed yet" (-1).
     prevloc = int(self.index_lastloc() or '-1')
     lastloc = int(self._last_unindexed_loc)
     # notice is a function that receives the number of docs being indexed.
     if notice:
       notice(lastloc - prevloc)
     for i in xrange(prevloc+1, lastloc+1):
       indexer.index_doc(str(i), indexyomi=config.INDEX_YOMI)
     indexer.finish()
     self.merge(force)
     self._last_unindexed_loc = None
   return
Пример #2
0
def index(argv):
  """Command-line entry point: index corpus locations into an index directory.

  argv is the full argument vector; argv[0] is the program name and is
  only used in the usage message.

  Options:
    -v           increase verbosity (may be repeated; starts at 1)
    -F           force: index every existing location (default)
    -U           update only: skip already-indexed locations whose corpus
                 mtime is older than the index mtime
    -N           new only: skip every already-indexed location
    -R           reset the index database first, then behave like -F
    -Y           use the 'yomi' index style instead of 'normal'
    -b basedir   base directory handed to the corpus
    -p prefix    index file prefix; must be exactly 3 characters
    -c type      corpus type name (resolved by corpus.get_corpustype)
    -t type      document type name (resolved by document.get_doctype)
    -e encoding  text encoding (default: locale preferred encoding)
    -D maxdocs   document threshold passed to the Indexer (default 1000)
    -T maxterms  term threshold passed to the Indexer (default 50000)

  The first positional argument is the index directory.  Remaining
  arguments are the locations to index; when there are none, locations
  are read from standard input, one per line.

  Exits with status 2 (after printing usage) on bad options or a missing
  index directory.  Returns None.
  """
  import getopt, locale
  def usage():
    sys.stdout.write('usage: %s [-v] [-F|-U|-N|-R] [-Y] [-b basedir] [-p prefix] [-c corpustype] [-t doctype] [-e encoding] [-D maxdocs] [-T maxterms] idxdir [file ...]' % argv[0] + '\n')
    sys.exit(2)
  try:
    (opts, args) = getopt.getopt(argv[1:], 'vFURNYb:p:c:t:e:D:T:')
  except getopt.GetoptError:
    usage()
  verbose = 1
  mode = 0
  basedir = ''
  prefix = 'idx'
  corpustype = corpus.FilesystemCorpus
  doctype = document.PlainTextDocument
  encoding = locale.getpreferredencoding()
  maxdocs = 1000
  maxterms = 50000
  indexstyle = 'normal'
  for (k, v) in opts:
    # The option spec and the usage text both declare -v; testing for any
    # other letter here would leave the flag accepted but ignored.
    if k == '-v': verbose += 1
    elif k == '-F': mode = 0 # force
    elif k == '-U': mode = 1 # update only
    elif k == '-N': mode = 2 # new document only
    elif k == '-R': mode = 3 # reset
    elif k == '-Y': indexstyle = 'yomi'
    elif k == '-b': basedir = v
    elif k == '-p': prefix = v
    elif k == '-c': corpustype = corpus.get_corpustype(v)
    elif k == '-t': doctype = document.get_doctype(v)
    elif k == '-e': encoding = v
    elif k == '-D': maxdocs = int(v)
    elif k == '-T': maxterms = int(v)
  if not args: usage()
  assert len(prefix) == 3, 'prefix must be exactly 3 characters: %r' % (prefix,)
  idxdir = args[0]
  cps = corpustype(basedir, doctype, encoding, indexstyle)
  cps.open()
  # From here on the corpus is open: make sure it gets closed again even
  # if setting up the index or indexing a location raises.
  try:
    indexdb = IndexDB(idxdir, prefix)
    try:
      indexdb.create()
    except IndexDB.IndexDBError:
      # Best effort: creation failure is ignored and the database is
      # opened as-is (presumably it already exists -- confirm in IndexDB).
      pass
    indexdb.open()
    if mode == 3:
      # After a reset nothing is indexed, so proceed exactly like "force".
      indexdb.reset()
      mode = 0
    indexer = Indexer(indexdb, cps, maxdocs, maxterms, verbose=verbose)
    sys.stderr.write(
      'Index: basedir=%r, idxdir=%r, max_docs_threshold=%d, max_terms_threshold=%d ' %
      (basedir, idxdir, maxdocs, maxterms) + '\n')

    files = args[1:]
    lastmod = indexdb.index_mtime()
    if not files:
      # No locations on the command line: read them from stdin, one per line.
      files = sys.stdin
    for fname in files:
      fname = fname.strip()
      if not cps.loc_exists(fname): continue
      if indexdb.loc_indexed(fname):
        # Already indexed: skip unconditionally in "new only" mode, and in
        # "update only" mode when the corpus copy predates the index.
        if mode == 2 or ((mode == 1) and cps.loc_mtime(fname) < lastmod): continue
      indexer.index_loc(fname)

    indexer.finish()
  finally:
    cps.close()
  sys.stderr.write('Done.' + '\n')
  return