Пример #1
0
 def __init__(self, dirname, verbose=False):
   """Bind the corpus to the directory tree rooted at dirname.

   dirname -- base directory holding the 'tar', 'label' and 'idx'
              subdirectories.
   verbose -- stored on the instance as self.verbose.
   """
   self.verbose, self.dirname = verbose, dirname
   # Nothing is open yet and no message is waiting to be indexed.
   self.mode = self._last_unindexed_loc = None
   tar_dir = os.path.join(dirname, 'tar')
   label_dir = os.path.join(dirname, 'label')
   index_dir = os.path.join(dirname, 'idx')
   self._db = TarDB(tar_dir)
   self._labeldb = LabelDB(label_dir)
   Corpus.__init__(self, index_dir, 'idx')
Пример #2
0
 def create(dirname):
   """Create the on-disk layout of a new corpus under dirname.

   Makes the 'tar', 'idx' and 'label' subdirectories (dirname itself
   must already exist, since os.mkdir does not create parents) and then
   initializes the tar database via TarDB.create.
   """
   for subdir in ('tar', 'idx', 'label'):
     os.mkdir(os.path.join(dirname, subdir))
   TarDB.create(os.path.join(dirname, 'tar'))
Пример #3
0
 def __init__(self, basedir):
      """Attach the tar and text databases found under basedir.

      basedir -- base directory; the databases use its 'tar' and 'text'
                 subdirectories respectively.
      """
      self.basedir = basedir
      tar_path = os.path.join(basedir, 'tar')
      text_path = os.path.join(basedir, 'text')
      self._tar = TarDB(tar_path)
      self._text = TextDB(text_path)
Пример #4
0
class MailCorpus(Corpus):
  """Corpus of mail messages stored in a TarDB with a companion LabelDB.

  Layout under dirname:
    tar/    -- TarDB holding one gzip-compressed record per message.
    label/  -- LabelDB mapping record numbers to labels.
    idx/    -- index directory handed to Corpus.__init__.

  A message location ("loc") is the decimal string of its record number.
  Labels are single alphanumeric characters; they are also encoded into
  the record name as '%08x.<sorted labels>'.
  """

  class MailCorpusError(Exception): pass
  class DatabaseLocked(MailCorpusError): pass

  # Document-count thresholds passed to the Merger for normal and
  # large (forced) merges.
  SMALL_MERGE = 20
  LARGE_MERGE = 2000

  # Callable taking a dirname; installed with register_singleton_handler().
  singleton_handler = None
  @classmethod
  def register_singleton_handler(klass, handler):
    """Install the callable used by _get_singleton()."""
    klass.singleton_handler = handler
    return
  
  def _get_singleton(self):
    """Return whatever the registered handler yields for this dirname.

    Raises TypeError if no handler has been registered (it is still None).
    """
    return MailCorpus.singleton_handler(self.dirname)

  def __getstate__(self):
    """Return the Corpus state minus the per-process attributes."""
    odict = Corpus.__getstate__(self)
    # these odict values are never treated seriously.
    del odict['mode']
    del odict['_db']
    del odict['_labeldb']
    del odict['_last_unindexed_loc']
    return odict

  def __init__(self, dirname, verbose=False):
    """Bind the corpus to the directory tree rooted at dirname.

    dirname -- base directory holding the 'tar', 'label' and 'idx'
               subdirectories.
    verbose -- passed on to the Indexer created by flush().
    """
    self.verbose = verbose
    self.dirname = dirname
    self.mode = None
    self._last_unindexed_loc = None
    self._db = TarDB(os.path.join(dirname, 'tar'))
    self._labeldb = LabelDB(os.path.join(dirname, 'label'))
    Corpus.__init__(self, os.path.join(dirname, 'idx'), 'idx')
    return

  def __len__(self):
    """Return the number of records in the tar database."""
    return len(self._db)
  
  def __repr__(self):
    return '<MailCorpus: dirname=%r, db=%r, last_unindexed_loc=%r>' % \
           (self.dirname, self._db, self._last_unindexed_loc)

  @staticmethod
  def create(dirname):
    """Create the 'tar', 'idx' and 'label' subdirectories and the TarDB.

    dirname itself must already exist (os.mkdir does not create parents).
    """
    os.mkdir(os.path.join(dirname, 'tar'))
    os.mkdir(os.path.join(dirname, 'idx'))
    os.mkdir(os.path.join(dirname, 'label'))
    TarDB.create(os.path.join(dirname, 'tar'))
    return

  def set_writable(self):
    """Switch the corpus to mode 'r+', reopening it if necessary.

    Raises DatabaseLocked if the database cannot be opened for writing;
    in that case the corpus is reopened read-only before re-raising.
    """
    if self.mode == 'r+': return
    if self.mode == 'r':
      self.close()
    try:
      self.open('r+')
    except MailCorpus.DatabaseLocked:
      # Fall back to read-only so the corpus stays usable, then report.
      self.open('r')
      raise
    return

  def get_labeldb(self):
    """Return the underlying LabelDB."""
    return self._labeldb

  def open(self, mode='r'):
    """Open the tar database in the given mode and record that mode.

    Raises DatabaseLocked when the TarDB reports a LockError.
    """
    try:
      self._db.open(mode)
    except TarDB.LockError:
      raise MailCorpus.DatabaseLocked('Database locked.')
    self._last_unindexed_loc = None
    self.mode = mode
    return

  def merge(self, large=False):
    """Run the index Merger with the small or large document threshold."""
    from fooling.merger import Merger
    docs_threshold = self.SMALL_MERGE
    if large:
      docs_threshold = self.LARGE_MERGE
    Merger(self, max_docs_threshold=docs_threshold).run(True)
    return

  def flush(self, notice=None, force=False):
    """Index every record added since the last indexed location.

    notice -- optional function that receives the number of docs being
              indexed.
    force  -- index up to the last record even if nothing was added
              through add_message(), and run a large merge afterwards.
    """
    from fooling.indexer import Indexer
    if force:
      self._last_unindexed_loc = len(self)-1
    # Compare against None rather than relying on truthiness: a forced
    # flush of a one-record corpus stores the integer 0, which is falsy
    # and used to make the whole indexing pass be skipped.
    if self._last_unindexed_loc is not None:
      indexer = Indexer(self, verbose=self.verbose)
      prevloc = int(self.index_lastloc() or '-1')
      lastloc = int(self._last_unindexed_loc)
      # notice is a function that receives the number of docs being indexed.
      if notice:
        notice(lastloc - prevloc)
      for i in xrange(prevloc+1, lastloc+1):
        indexer.index_doc(str(i), indexyomi=config.INDEX_YOMI)
      indexer.finish()
      self.merge(force)
      self._last_unindexed_loc = None
    return

  def close(self, notice=None):
    """Flush pending documents, then close both databases."""
    self.flush(notice)
    self.mode = None
    self._db.close()
    self._labeldb.close()
    return
  
  def get_message(self, loc):
    """Return the decompressed message data stored at loc."""
    (info, data) = self._db.get_record(int(loc))
    fp = gzip.GzipFile(fileobj=StringIO.StringIO(data))
    try:
      return fp.read()
    finally:
      # Release the gzip reader even if decompression fails.
      fp.close()
    
  def add_message(self, data, labels, mtime=0):
    """Store a message gzip-compressed and return its new loc string.

    data   -- raw message data.
    labels -- iterable of single-character alphanumeric labels.
    mtime  -- record timestamp; the current time is used when it is 0.

    The new record is remembered as pending so that flush() indexes it.
    """
    import time
    info = TarInfo(self._labels2name(len(self._db), labels))
    info.mtime = mtime or int(time.time())
    fp = StringIO.StringIO()
    gz = gzip.GzipFile(mode='w', fileobj=fp)
    gz.write(data)
    gz.close()
    recno = self._db.add_record(info, fp.getvalue())
    self._labeldb.add_label(recno, labels)
    self._last_unindexed_loc = str(recno)
    return self._last_unindexed_loc

  # Internal routine to access TarDB.
  def _labels2name(self, recno, labels):
    """Build the record name '%08x.<sorted labels>' for recno.

    Raises AssertionError if the joined labels are not alphanumeric.
    """
    labels = ''.join(sorted(labels))
    if labels and not labels.isalnum():
      raise AssertionError('Invalid labels: %r' % labels)
    return '%08x.%s' % (recno, labels)
  
  FILENAME_PAT = re.compile(r'[0-9a-f]{8}\.(.*)')
  def _name2labels(self, name):
    """Return the set of label characters encoded in a record name.

    Raises AssertionError if name does not match FILENAME_PAT.
    """
    m = self.FILENAME_PAT.match(name)
    if not m:
      raise AssertionError('Invalid file name: %r' % name)
    return set(m.group(1))
    
  def get_message_labels(self, loc):
    """Return the set of labels attached to the message at loc."""
    info = self._db.get_info(int(loc))
    return self._name2labels(info.name)
    
  def add_message_label(self, loc, labels):
    """Add labels to the message at loc, in the record name and LabelDB."""
    recno = int(loc)
    info = self._db.get_info(recno)
    labels1 = self._name2labels(info.name).union(set(labels))
    info.name = self._labels2name(recno, labels1)
    self._db.set_info(recno, info)
    self._labeldb.add_label(recno, labels)
    return

  def del_message_label(self, loc, labels):
    """Remove labels from the message at loc, in the record name and LabelDB."""
    recno = int(loc)
    info = self._db.get_info(recno)
    labels1 = self._name2labels(info.name).difference(set(labels))
    info.name = self._labels2name(recno, labels1)
    self._db.set_info(recno, info)
    self._labeldb.del_label(recno, labels)
    return

  def mark_deleted(self, loc):
    """Tag the message at loc with config.LABEL4DELETED."""
    self.add_message_label(loc, config.LABEL4DELETED)
    return

  # Corpus methods
  def loc_exists(self, loc):
    """Return True if loc is a valid record number of the tar database."""
    recno = int(loc)
    return 0 <= recno and recno < len(self._db)

  def loc_fp(self, loc):
    """Return a file-like object over the decompressed message at loc."""
    return StringIO.StringIO(self.get_message(loc))

  def loc_mtime(self, loc):
    """Return the mtime recorded for the message at loc."""
    info = self._db.get_info(int(loc))
    return info.mtime

  def loc_size(self, loc):
    """Return the decompressed size of the message at loc."""
    return len(self.get_message(loc))

  def get_doc(self, loc):
    """Return an EMailDocumentWithLabel for the message at loc."""
    info = self._db.get_info(int(loc))
    return EMailDocumentWithLabel(self, loc, info.mtime)
Пример #5
0
class MessageDB:
    """Message store pairing a TarDB of raw messages with a TextDB.

    Both databases live under basedir, in its 'tar' and 'text'
    subdirectories. The TextDB holds the text and the tags of each message,
    keyed by the record number the message has in the TarDB.
    """

    # Size limit handed to cutoff() for the text stored per message.
    MAX_TEXT_SIZE = 100000

    def __init__(self, basedir):
        """Attach the tar and text databases found under basedir."""
        self.basedir = basedir
        self._tar = TarDB(os.path.join(basedir, 'tar'))
        self._text = TextDB(os.path.join(basedir, 'text'))

    def create(self):
        """Create basedir and initialize both databases.

        Raises OSError if basedir already exists (os.makedirs default).
        """
        os.makedirs(self.basedir)
        self._tar.create()
        self._text.create()

    def open(self):
        """Open both databases."""
        self._tar.open()
        self._text.open()

    def close(self):
        """Close both databases."""
        self._tar.close()
        self._text.close()

    def flush(self):
        """Flush both databases."""
        self._tar.flush()
        self._text.flush()

    def add_file(self, data):
        """Store one message given as bytes and return its record number.

        The raw data is stored compressed (via bytes2gzip) in the tar
        database under a zero-padded decimal record name; the message text,
        limited by MAX_TEXT_SIZE, and its tags go to the text database.
        """
        recno = self._tar.next_recno()
        info = TarInfo('%08d' % recno)
        self._tar.add_record(info, bytes2gzip(data))
        msg = message_from_bytes(data)
        text = cutoff(msg2str(msg), self.MAX_TEXT_SIZE)
        self._text.add_text(recno, text)
        for tag in msg2tags(msg):
            self._text.add_tag(recno, tag)
        return recno

    def _match_all(self, lookup, keys):
        """Return the record numbers matched by every key, highest first.

        lookup -- callable mapping one key to an iterable of record numbers.
        keys   -- iterable of keys; an empty one matches nothing.
        """
        result = None
        for key in keys:
            recs = set(lookup(key))
            if result is None:
                result = recs
            else:
                # set has no update_intersection(); this is the in-place
                # intersection the original code intended.
                result.intersection_update(recs)
        # result is still None when keys was empty: treat it as no match.
        return sorted(result or (), reverse=True)

    def search_tag(self, tags):
        """Yield the text of each record carrying all of tags.

        Records are yielded from the highest record number down. An empty
        tags yields nothing.
        """
        for recno in self._match_all(self._text.search_tag, tags):
            yield self._text.get_text(recno)

    def search_text(self, qs):
        """Yield the text of each record matching all of the queries in qs.

        Records are yielded from the highest record number down. An empty
        qs yields nothing.
        """
        for recno in self._match_all(self._text.search_text, qs):
            yield self._text.get_text(recno)