Пример #1
0
 def getRelatedDocIds(self, cutoff=7):
     """Return the set of other doc ids that share a pippi with this one.

     Queries ``Pippies`` for entries whose ``len`` is at least ``cutoff``
     (coerced with ``int``) and whose ``docs`` field matches ``self._id``.
     Every member of the ``docs`` field of those entries is collected,
     except ``self._id`` itself.
     """
     query = {'len': {'$gte': int(cutoff)}, 'docs': self._id}
     related = set()
     # Only the 'docs' field is requested from each matching entry.
     for pippi in Pippies.find(query, ['docs']):
         for doc in pippi['docs']:
             if doc != self._id:
                 related.add(doc)
     return related
Пример #2
0
 def getRelatedDocIds(self, cutoff=7):
     """Collect the ids of the documents related to this one via pippies.

     A ``Pippies`` entry counts when its ``len`` is greater than or equal
     to ``int(cutoff)`` and its ``docs`` field matches ``self._id``. The
     result is the set of all ``docs`` members of those entries, with
     ``self._id`` left out.
     """
     # Second argument limits the returned fields to 'docs'.
     matches = Pippies.find({'len': {'$gte': int(cutoff)},
                             'docs': self._id},
                            ['docs'])
     return set(other
                for match in matches
                for other in match['docs']
                if other != self._id)
Пример #3
0
def search(request):
    """Render the pippies that match the stemmed search query ``q``.

    GET parameters read:
      q       -- the search phrase (required).
      orderby -- 'relevance', 'docslen' or 'len'; any other value falls
                 back to 'len'.

    Every token of the query is stemmed with hunspell; a token without a
    stem becomes an empty string, which is shown as '*'. The space-joined
    stems are used as a regular expression on the 'pippi' field, and the
    first occurrence of the query inside each result is wrapped in a
    ``hilite-query`` span.

    Renders 'error.html' when the query is missing or yields no tokens
    (e.g. whitespace only), otherwise 'search.html'.
    """
    q = cgi.escape(request.GET.get('q',''))
    tokens = nltk.tokenize.wordpunct_tokenize(unicode(q)) if q else []
    if not tokens:
        # A query without tokens would produce an empty regex (matching
        # everything) and an empty partition separator, which raises
        # ValueError -- treat it like a missing query instead.
        return render_to_response('error.html', {'error': 'Missing search query!'}, context_instance=RequestContext(request))

    orderBy = cgi.escape(request.GET.get('orderby',''))
    # TODO also order by docslen (need to add that to bulksaver)
    if orderBy not in ('relevance', 'docslen', 'len'):
        orderBy = 'len'
    # TODO also handle desc/asc via the tableheader on the web ui
    orderDesc = True

    engine = hunspell.HunSpell(settings.DICT+'.dic', settings.DICT+'.aff')
    filtr = []
    for word in tokens:
        # stem each word; keep an empty placeholder for unknown words
        stem = engine.stem(word.encode('utf8'))
        filtr.append(stem[0] if stem else '')

    # NOTE(review): the stems go into the regex unescaped -- confirm that
    # hunspell stems can never contain regex metacharacters.
    template_vars = pager(request, Pippies.find({'pippi': re.compile(' '.join(filtr))}), orderBy, orderDesc)

    # The displayed form of the query, used to locate the part to highlight.
    needle = ' '.join([p if p else '*' for p in filtr])
    results = []
    for pippi in template_vars['data']:
        stems = pippi['pippi'].split(' ')
        shown = ' '.join([p if p else '*' for p in stems])
        results.append({
            'id': pippi['_id'],
            # partition() yields (before, match, after) for the three slots
            'pippi': '%s<span class="hilite-query">%s</span>%s' % shown.partition(needle),
            'docslen': pippi['docslen'],
            'len': len(stems),
            'relevance': pippi.get('relevance',0),
        })
    template_vars['pippies'] = results
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    return render_to_response('search.html', template_vars, context_instance=RequestContext(request))
Пример #4
0
def pippies(request):
    """Render a paged, optionally filtered list of pippies.

    GET parameters read (each one is ignored when it cannot be parsed):
      cutoff    -- integer, default '7'; keeps pippies whose 'len' is at
                   least this value (a value of 0 disables the filter).
      doc       -- ObjectId; keeps pippies whose 'docs' matches it.
      relevance -- integer; keeps pippies with exactly this 'relevance'.
      orderby   -- sort key handed to ``pager``, default 'relevance'.
      desc      -- '1' (default) for descending order, anything else for
                   ascending.

    Renders 'pippies.html'. When a ``doc`` filter is active and the
    document exists, its title (or, lacking one, its 'docid') is passed
    to the template as 'docTitle'.
    """
    filtr = {}
    docfilter = None
    relfilter = None
    cutoff = None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except (TypeError, ValueError):
        # not a number: no length filter
        pass
    if cutoff:
        filtr['len'] = {'$gte': cutoff}
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # missing or malformed id: no document filter
        pass
    if docfilter:
        filtr['docs'] = docfilter
    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except (TypeError, ValueError):
        # missing or not a number: no relevance filter
        pass
    if relfilter:
        filtr['relevance'] = relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = '1' == cgi.escape(request.GET.get('desc', '1'))
    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [{
        'id':
        pippi['_id'],
        # empty stems are displayed as '*'
        'pippi':
        ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
        'docslen':
        len(pippi['docs']),
        'relevance':
        pippi.get('relevance', 0),
    } for pippi in template_vars['data']]
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        # find_one returns None for an unknown id; leave 'docTitle' unset
        # in that case instead of failing on the membership test.
        if doc is not None:
            template_vars[
                'docTitle'] = doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html',
                              template_vars,
                              context_instance=RequestContext(request))
Пример #5
0
def _report_progress(i, total):
    """Write a progress mark to stdout when item ``i`` starts a new percent.

    Nothing is written while the integer percentage is unchanged from the
    previous item. Otherwise the percentage itself is written when it is a
    multiple of ten, and a '.' in all other cases.
    """
    percent = i * 100 // total
    if percent == (i - 1) * 100 // total:
        return
    if percent % 10 == 0:
        sys.stdout.write("%d" % percent)
    else:
        sys.stdout.write('.')
    sys.stdout.flush()


def main():
    """Recompute the derived fields of all pippies and docs.

    First pass: for every pippi with at least one doc, store 'relevance'
    (its 'len' divided by its number of docs) and 'docslen' (the number of
    docs). Second pass: store ``Doc(d=...).tfidf`` as 'tfidf' on every doc.
    Progress of both passes is reported on stdout.
    """
    print("updateing pippies.relevance")
    pippies = Pippies.find({}, ['docs', 'len'])
    pippieslen = pippies.count()
    for i, pippi in enumerate(pippies, 1):
        _report_progress(i, pippieslen)
        doccount = len(pippi['docs'])
        # pippies without docs are skipped (avoids dividing by zero)
        if doccount > 0:
            Pippies.update({'_id': pippi['_id']},
                           {'$set': {'relevance': float(pippi['len'])/float(doccount),
                                     'docslen': doccount,}, })
    sys.stdout.write('\n')
    sys.stdout.flush()

    print("updateing docs.idf")
    docs = Docs.find({}, ['termcnt', 'docid', 'stemsid', 'rawid'])
    docslen = docs.count()
    for i, dd in enumerate(docs, 1):
        _report_progress(i, docslen)
        Docs.update({'_id': dd['_id']}, {'$set': {'tfidf': Doc(d=dd).tfidf}})
    sys.stdout.write('\n')
    sys.stdout.flush()
Пример #6
0
def pippies(request):
    """Render a paged, optionally filtered list of pippies.

    GET parameters read (each one is ignored when it cannot be parsed):
      cutoff    -- integer, default '7'; keeps pippies whose 'len' is at
                   least this value (a value of 0 disables the filter).
      doc       -- ObjectId; keeps pippies whose 'docs' matches it.
      relevance -- integer; keeps pippies with exactly this 'relevance'.
      orderby   -- sort key handed to ``pager``, default 'relevance'.
      desc      -- '1' (default) for descending order, anything else for
                   ascending.

    Renders 'pippies.html'. When a ``doc`` filter is active and the
    document exists, its title (or, lacking one, its 'docid') is passed
    to the template as 'docTitle'.
    """
    filtr={}
    docfilter=None
    relfilter=None
    cutoff=None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff','7')))
    except (TypeError, ValueError):
        pass  # not a number: no length filter
    if cutoff: filtr['len']={ '$gte': cutoff }
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc','')))
    except Exception:
        pass  # missing or malformed id: no document filter
    if docfilter:
        filtr['docs']=docfilter
    try:
        relfilter =  int(cgi.escape(request.GET.get('relevance','')))
    except (TypeError, ValueError):
        pass  # missing or not a number: no relevance filter
    if relfilter: filtr['relevance']=relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby','relevance'))
    orderDesc = '1'==cgi.escape(request.GET.get('desc','1'))
    template_vars=pager(request,Pippies.find(filtr),orderBy,orderDesc)
    # empty stems inside a pippi are displayed as '*'
    template_vars['pippies']=[{'id': pippi['_id'],
                               'pippi': ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
                               'docslen':len(pippi['docs']),
                               'relevance':pippi.get('relevance',0),}
                               for pippi in template_vars['data']]
    template_vars['doc']=docfilter
    if docfilter:
        doc=Docs.find_one({'_id': docfilter},['docid', 'title'])
        # find_one returns None for an unknown id; leave 'docTitle' unset
        # in that case instead of failing on the membership test.
        if doc is not None:
            template_vars['docTitle']=doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html', template_vars, context_instance=RequestContext(request))