def main(argv):
    if len(argv) <= 1:
        print __doc__
        sys.exit(-1)

    option = argv[1]

    if option == '-q':
        print 'getQueueStatus numIndexed %s date %s numQueued %s' % getQueueStatus()

    elif option == '-t':
        logpath = cfg.getpath('logs')
        qlogs = _getQueuedLogs(logpath)
        transformed, discarded = TransformProcess().run(logpath, qlogs)
        print transformed, discarded

    elif option == '-i':
        logpath  = cfg.getpath('logs')
        qtxts = _getQueuedText(logpath)
        indexed, discarded = IndexProcess().run(logpath, qtxts)
        print indexed, discarded

    elif option == '-b':
        messagelog.mlog.lastRequest = datetime.datetime(1990,1,1)   # enable _shouldTransform
        result = backgroundIndexTask()
        print result
Exemplo n.º 2
0
    def __init__(self):
        """ Create the lock, work out the weblib pathname, build a
            weblib.WebLibrary constructed with this object, then call reset().
        """
        self.lock = threading.RLock()

        weblibDir = cfg.getpath("weblib")
        self.pathname = weblibDir / self.DEFAULT_FILENAME

        # the WebLibrary is handed this (partly initialised) object, so the
        # attributes above are assigned before it is constructed
        self.wlib = weblib.WebLibrary(self)

        self.writer = None
        self.reset()
Exemplo n.º 3
0
def search(query, start, end):
    """ Search the archive index for query and build MatchItems for the
        hit positions in the half-open range [start, end).

        query      - query object; it is rewritten against the index reader
                     before it is used for searching and highlighting
        start, end - range of hit positions to build MatchItems for; the
                     loop stops early once a position reaches hits.length()

        NOTE(review): no return statement is visible in this block - `result`
        is built but never returned here. Confirm whether the tail of the
        function is missing.
    """

    # search
    indexpath = cfg.getpath('archiveindex')
    searcher = lucene_logic.Searcher(pathname=indexpath)
    query = query.rewrite(searcher.reader.reader)
    hits = searcher.search(query)

    hitList = sortHits(hits, searcher.reader.maxDoc()+2000)

    # prepare for highlighter
    formatter = SimpleHTMLFormatter("<span class='highlight'>", "</span>")
    highlighter = Highlighter( formatter, QueryScorer(query))
    highlighter.setTextFragmenter(SimpleFragmenter(50))
    analyzer = StandardAnalyzer()

    # build a MatchItem list
    result = []
    for i in xrange(start,end):
        if i >= hits.length():
            break
        item = MatchItem(hitList, i)
        try:
            item.highlight(analyzer, highlighter)
        except Exception, e:
            # a highlighting failure is logged; the item is still appended below
            log.exception('Error highlighting %s' % item);
        #item.explaination = str(searcher.explain(query, item.id))
        result.append(item)
Exemplo n.º 4
0
def main(argv):
    """ Helper to show structure of template """
    filename = argv[1]
    pathname = cfg.getpath('docBase')/filename
    print
    print 'File:', pathname
    print 'Date:',str(datetime.datetime.now())[:19]
    fp = file(pathname,'rb')
    template = HTMLTemplate.Template(None, fp.read())
    print template.structure()
def parseId(id):
    """ Return arc_path, filename represented by id.
        e.g. id=123456789 -> $archive/123456.zip/789

        id must be a string of exactly 9 digits: the first 6 select the zip
        archive under the 'archive' path, the last 3 are returned as the
        filename.

        Raises KeyError if malformed
    """
    if not id.isdigit() or len(id) != 9:
        # call-style raise is valid on both Python 2 and Python 3
        raise KeyError('Invalid id: %s' % str(id))

    # '/' binds tighter than '+': id[:6] is joined onto the archive path first
    # and the '.zip' suffix is then appended to the result of that join.
    arc_path = cfg.getpath('archive') / id[:6]+'.zip'
    return arc_path, id[6:]
def getQueueStatus():
    """ Return the number of docs indexed and number of docs queued.

        Returns a tuple (totalIndexed, archive_date, numQueued).

        totalIndexed and archive_date are module-level values. They are read
        from the archive index only while totalIndexed is negative; otherwise
        the stored values are reused. numQueued is recounted on every call as
        the number of queued text files plus the number of queued logs.
    """

    from minds import lucene_logic

    global totalIndexed, archive_date
    if totalIndexed < 0:
        indexpath = cfg.getpath('archiveindex')
        reader = lucene_logic.Reader(indexpath)
        try:
            totalIndexed = reader.reader.numDocs()
            # find out archive_date from the first 10 document
            # NOTE(review): the scan starts at document 1, not 0 - confirm
            # that skipping document 0 is intended.
            for i in range(1,min(11,totalIndexed)):
                doc = reader.reader.document(i)
                d = doc.get('date')
                if d:
                    archive_date = d
                    break
        finally:
            # release the index reader even if reading a document fails
            reader.close()
    logpath = cfg.getpath('logs')
    numQueued = len(_getQueuedText(logpath)) + len(_getQueuedLogs(logpath))
    return totalIndexed, archive_date, numQueued
def forwardTmpl(wfile, env, tmpl, renderMod, *args):
    """ Render the template file tmpl with renderMod.render and write the
        result to wfile.

        wfile     - output file object; receives the rendered text
        env       - not used by this function
        tmpl      - template pathname relative to the 'docBase' path,
                    e.g. 'tmpl/home.html'
        renderMod - object whose render attribute is handed to
                    HTMLTemplate.Template as the render callback
        *args     - passed through to the template's render()
    """

    # e.g. SCRIPT_NAME='/admin/snoop.py', tmpl='tmpl/home.html'

    #scriptname = env.get('SCRIPT_NAME','')                          # '/admin/snoop.py'
    #scriptpath, scriptfile = os.path.split(scriptname.lstrip('/'))  # 'admin', 'snoop'

    # invoke tmpl's render() method
    fp = file(cfg.getpath('docBase') / tmpl)
    try:
        template = HTMLTemplate.Template(renderMod.render, fp.read())
    finally:
        # close the template file even if reading or parsing fails
        fp.close()

    wfile.write(template.render(*args))
Exemplo n.º 8
0
def main():
    """ Application entry point.

        Calls setup(), logs system and index information, starts the proxy,
        admin and index threads (in that order), then blocks on
        _shutdownEvent. After the event is set the threads are joined in
        reverse order of their start.
    """
    import PyLucene

    setup()

    # log some system info
    platform = sys.platform
    if 'win32' in sys.platform: platform += str(sys.getwindowsversion())

    log.info('-'*70)
    log.info('%s %s', cfg.application_name, cfg.get('version.number'))
    log.info('Python %s', sys.version)
    log.info('  Platform %s', platform)
    log.info('  pwd: %s, defaultencoding: %s', os.getcwd(), sys.getdefaultencoding())
    log.info('PyLucene %s Lucene %s LOCK_DIR %s',
        PyLucene.VERSION, PyLucene.LUCENE_VERSION, PyLucene.FSDirectory.LOCK_DIR)

    # show index version
    import lucene_logic
    dbindex = cfg.getpath('archiveindex')
    reader = lucene_logic.Reader(pathname=dbindex)
    try:
        version = reader.getVersion()
    finally:
        # release the index reader even if the version lookup fails
        reader.close()
    log.info('  Index version %s', version)

    # the proxy runs on a plain Python thread; admin and index run on
    # PyLucene threads
    proxyThread = threading.Thread(target=proxyMain, name='proxy')
    #proxyThread.setDaemon(True)
    proxyThread.start()

    adminThread = PyLucene.Thread(runnable(adminMain))
    #adminThread.setDaemon(True)
    adminThread.start()

#    time.sleep(3)
    indexThread = PyLucene.Thread(runnable(indexMain))
    indexThread.start()

    # main thread sleep
    _shutdownEvent.wait()

    # shutdown
    indexThread.join()
    log.fatal('indexThread terminated.')
    adminThread.join()
    log.fatal('adminThread terminated.')
    proxyThread.join()
    log.fatal('proxyThread terminated.')
    log.fatal('End of main thread.')
Exemplo n.º 9
0
    def output(self, *args):
        """ Render the content template with args, render the layout
            template around it and write the result to self.out.
        """
        # content first: render it, then split the rendered text into its
        # style block, script block and remaining content text
        self.content_text = self.template.render(*args)
        pieces = _split_style_script_block(self.content_text)
        self.style_block, self.script_block, self.content_text = pieces

        # load the layout frame template from under docBase
        layoutPath = cfg.getpath('docBase')/self.LAYOUT_TMPL
        layoutFile = layoutPath.open('rb')
        try:
            layoutSource = layoutFile.read()
        finally:
            layoutFile.close()

        # render the layout via render_layout and emit it
        frame = HTMLTemplate.Template(self.render_layout, layoutSource)
        self.out.write(frame.render())
def doSnapshot(wfile, form, str_rid, item):
    """ Fetch a snapshot of a URL, save it as '<str_rid>.mhtml' under the
        'weblibsnapshot' path and redirect to '../snapshotFrame'.

        The URL is the form's 'url' field; when that is empty and item is
        given, item.url is used instead. When item is given its cached
        attribute is set to the current date (first 10 characters of the
        datetime string).
    """
    url = form.getfirst('url')
    if not url:
        if item:
            url = item.url

    snap = snapshot.Snapshot()
    snap.fetch(url)

    target = cfg.getpath('weblibsnapshot')/('%s.mhtml' % str_rid)
    out = target.open('wb')
    try:
        snap.generate(out)
    finally:
        out.close()

    if item:
        item.cached = str(datetime.datetime.now())[:10]

    response.redirect(wfile, '../snapshotFrame')
def doShowSnapshot(wfile, rid, rid_path):
    """ Redirect to the root URI of the stored snapshot of weblib item rid.

        wfile    - output file object for the response
        rid      - id of the web page item; -1 selects the file '_.mhtml',
                   any other value selects '<rid>.mhtml'
        rid_path - not used here; it is for the user's information only,
                   rid alone determines where to go

        Writes a '404 not found' message to wfile and returns when no item
        has this id.
    """
    wlib = store.getWeblib()
    item = wlib.webpages.getById(rid)
    if not item:
        wfile.write('404 not found\r\n\r\n%s not found' % rid)
        return

    if rid == -1:
        filename = '_.mhtml'
    else:
        filename = '%s.mhtml' % rid
    # TODO: check file exist, move to weblib? getSnapshotFile()?
    fp = (cfg.getpath('weblibsnapshot')/filename).open('rb')
    try:
        obj = mhtml.LoadedWebArchive.load_fp(fp)
        # read root_uri while the file is still open, in case the archive
        # object reads from fp lazily
        root_uri = obj.root_uri
    finally:
        # the snapshot file was previously never closed
        fp.close()

    # do visit?
    # wlib.visit(item)
    response.redirect(wfile, root_uri)
Exemplo n.º 12
0
def main(argv):

    if len(argv) < 2:
        print __doc__
        sys.exit(-1)

    index_path = argv[1]
    shutil.rmtree(index_path, True)

    starttime = datetime.datetime.now()
    apath = cfg.getpath('archive')
    idc = docarchive.idCounter
    idc._findIdRange()
    beginId = idc._beginId
    endId   = idc._endId
    print 'Reindex %s(#%d-%d) -> %s' % (apath, beginId, endId, index_path)
    reindex(apath, beginId, endId, index_path)
    print 'Reindex finished:', datetime.datetime.now() - starttime
Exemplo n.º 13
0
    def _findIdRange(self):
        """ Scan the $archive directory for zip files and set self._beginId
            and self._endId from them.

            Both are set to 0 when no archive file matches self.arc_pattern.
            Otherwise _beginId is built from the smallest archive name and
            the id _findId returns for it with min; _endId is built the same
            way from the largest archive name with max, plus one.
        """
        archiveDir = cfg.getpath('archive')
        arcNames = fileutil.listdir(archiveDir, self.arc_pattern)
        if not arcNames:
            self._beginId = self._endId = 0
            return

        lowestArc = min(arcNames)
        highestArc = max(arcNames)

        lowestDoc = self._findId(archiveDir/lowestArc, min)
        highestDoc = self._findId(archiveDir/highestArc, max)

        # archive prefix (6 chars) + id within the archive -> would be a 9 digit id
        self._beginId = int(lowestArc[:6] + lowestDoc)
        self._endId = int(highestArc[:6] + highestDoc) + 1
def backgroundIndexTask(forceIndex=False):
    """ This is the main task of qmsg_processor. The task has two phases.

    I. Transform phase

        Parse *.qlog
        Filter out unwanted docs
        Transform into *.qtxt
        Add into archive

        Suspend this process when user access proxy.


    II. Index phase

        Add *.qtxt into index
        Optimize

        During optimize, block out searching.
        (12/03/04 note: Due to GIL and PyLucene implementation, it
        will actually block out every thing, including proxy.)

    forceIndex - when true both phases run without consulting
        _shouldTransform / _shouldIndex.

    Returns transformed, indexed, discarded (discarded is the sum over
    both phases).
    """

    checkInterval = cfg.getint('indexing.interval',3)
    logdir = cfg.getpath('logs')
    startTime = datetime.datetime.now()

    numTransformed = numIndexed = 0
    dropped_t = dropped_i = 0

    # phase I: transform
    pendingLogs = _getQueuedLogs(logdir)
    if forceIndex or _shouldTransform(startTime, checkInterval):
        numTransformed, dropped_t = TransformProcess().run(logdir, pendingLogs)

    # phase II: index
    pendingTexts = _getQueuedText(logdir)
    runIndex = forceIndex
    if not runIndex:
        # first check is if there is new activity
        runIndex = _shouldTransform(startTime, checkInterval) and \
                   _shouldIndex(startTime, logdir, pendingTexts)
    if runIndex:
        numIndexed, dropped_i = IndexProcess().run(logdir, pendingTexts)

    return numTransformed, numIndexed, dropped_t + dropped_i
Exemplo n.º 15
0
    def __init__(self, wfile,
        content_type='text/html',
        encoding='utf-8',
        cache_control='no-cache'):
        """ Remember the output file and response settings, create an empty
            cookie jar and load the template named by self.TEMPLATE_FILE
            from under the 'docBase' path.
        """
        # response settings, stored as given
        self.wfile = wfile
        self.content_type = content_type
        self.encoding = encoding
        self.cache_control = cache_control

        self.cookie = Cookie.SimpleCookie()

        # the template is rendered through self.render
        templatePath = cfg.getpath('docBase')/self.TEMPLATE_FILE
        templateFile = templatePath.open('rb')
        try:
            self.template = HTMLTemplate.Template(self.render, templateFile.read())
        finally:
            templateFile.close()
 def testSafeConfig(self):
    """ Every configured path except docbase and testdoc must contain 'test'. """
    # make sure we are using safe test config
    names = [key for key, _ in cfg.cparser.items('path')]

    # take these items outside of test
    for exempt in ('docbase', 'testdoc'):
        names.remove(exempt)

    # check that the above code do what we want
    for expected in ('data', 'logs', 'weblibsnapshot', 'archiveindex'):
        self.assert_(expected in names)

    for name in names:
        self.assert_('test' in cfg.getpath(name))

    # we get test path even if we import from config
    from minds.config import cfg as config_cfg
    for name in names:
        self.assert_('test' in config_cfg.getpath(name))
Exemplo n.º 17
0
    def __init__(self, wfile,
        content_type='text/html',
        encoding='utf-8',
        cache_control='no-cache'):
        """ Load the template named by self.TEMPLATE_FILE, write the HTTP
            header to wfile and set up self.out as an encoding writer on
            wfile.

            The Cache-control header line is written only when cache_control
            is true. Characters that cannot be encoded are handled with the
            'replace' error mode.
        """
        # load template; it is rendered through self.render
        templatePath = cfg.getpath('docBase')/self.TEMPLATE_FILE
        templateFile = templatePath.open('rb')
        try:
            self.template = HTMLTemplate.Template(self.render, templateFile.read())
        finally:
            templateFile.close()

        # HTTP header, terminated by an empty line
        wfile.write('Content-type: %s; charset=%s\r\n' % (content_type, encoding))
        if cache_control:
            wfile.write('Cache-control: %s\r\n' % (cache_control,))
        wfile.write('\r\n')

        # build encoded output stream
        makeWriter = codecs.getwriter(encoding)
        self.out = makeWriter(wfile,'replace')
Exemplo n.º 18
0
def setupLogging():
    # remove any bootstrap log handler installed
    rootlog = logging.getLogger()
    map(rootlog.removeHandler, rootlog.handlers)

    syslogpath = cfg.getpath('logs')/'system.log'
    hdlr = logging.handlers.RotatingFileHandler(syslogpath, 'a', 1100000, 4)
    formatter = logging.Formatter('%(asctime)s %(name)-10s - %(message)s')
    hdlr.setFormatter(formatter)

    # work around [python-Bugs-1314519] logging run into deadlock in some error handling situation
    # https://sourceforge.net/tracker/?func=detail&atid=105470&aid=1314519&group_id=5470
    hdlr.lock = threading.RLock()

    rootlog.addHandler(hdlr)
    rootlog.setLevel(logging.DEBUG)

    # redirect stdout and stderr to log
    sys.stdout = LogFileObj(logging.getLogger('stdout'))
    sys.stderr = LogFileObj(logging.getLogger('stderr'))
    print 'stdout ready'
    print >>sys.stderr, 'stderr ready'
Exemplo n.º 19
0
    def _get_snapshot_content(self, item):
        """ Return the content parsed from the distilled snapshot of item.

            The snapshot file is '_.mhtml' when item.id is -1, otherwise
            '<item.id>.mhtml', under the 'weblibsnapshot' path.

            Returns '' when the snapshot file does not exist or when the
            archive yields no response for its root URI.
        """
        # TODO: refactor
        if item.id == -1:
            filename = '_.mhtml'
        else:
            filename = '%s.mhtml' % item.id
        spath = cfg.getpath('weblibsnapshot')/filename
        if not spath.exists():
            return ''

        fp = spath.open('rb')       # TODO: check file exist, move to weblib? getSnapshotFile()?
        try:
            lwa = mhtml.LoadedWebArchive(fp)
            resp = lwa.fetch_uri(lwa.root_uri)
            if not resp:
                return ''

            # TODO: lucene_logic: use to docid is confusing with lucene's internal docid?
            # TODO: mind content-type, encoding, framed objects??
            # NOTE(review): data is never used, and reading resp here may
            # leave nothing for distill() to consume below - confirm.
            data = resp.read()
            meta = {}
            contentBuf = StringIO.StringIO()
            distillML.distill(resp, contentBuf, meta=meta)
            contentBuf.seek(0)
            # TODO: what's the deal with writeHeader?
            meta, content = distillparse.parseDistillML(contentBuf, writeHeader=None)
            return content
        finally:
            # the snapshot file was previously left open on every path,
            # including the early return above
            fp.close()
 def _open(self):
     """ Open a lucene_logic Writer and then a Searcher on the
         'archiveindex' path, stored as self.writer and self.searcher.
     """
     from minds import lucene_logic

     archiveIndex = cfg.getpath('archiveindex')
     # writer first, searcher second - same order as before
     self.writer = lucene_logic.Writer(archiveIndex)
     self.searcher = lucene_logic.Searcher(pathname=archiveIndex)
Exemplo n.º 21
0
def openDomainFp(*args):
    """ Open the domain data file (DOMAINFILE under the 'logs' path).

        args are passed to file() after the filename and the opened file
        object is returned.
    """
    logdir = cfg.getpath('logs')
    return file(logdir/DOMAINFILE, *args)
Exemplo n.º 22
0
 def init_index(self):
     """ Open a lucene_logic Writer, Reader and Searcher (in that order) on
         the 'weblibindex' path and store them as self.index_writer,
         self.index_reader and self.index_searcher.
     """
     from minds import lucene_logic

     weblibIndex = cfg.getpath('weblibindex')
     self.index_writer = lucene_logic.Writer(weblibIndex)
     self.index_reader = lucene_logic.Reader(weblibIndex)
     self.index_searcher = lucene_logic.Searcher(pathname=weblibIndex)