def addDocs(self, d1, d2):
    """Cross-link two documents after a pippi run.

    Each document gets the other's oid pushed onto its ``pippiDocs`` list,
    and its ``pippiDocsLen`` counter incremented.
    """
    # symmetric update: register each doc against the other
    for doc, other in ((d1, d2), (d2, d1)):
        Docs.update({"_id": doc._id},
                    {'$push': {'pippiDocs': other._id},
                     '$inc': {'pippiDocsLen': 1}})
def pippi(request, refdoc=None):
    """Render the pippi page: every other document relative to *refdoc*.

    Fix: removed the unused ``docslen = Docs.count()`` result (a wasted
    round-trip to the database).

    :param request: Django request object.
    :param refdoc: docid of the reference document; required.
    """
    if not refdoc:
        return render_to_response('error.html',
                                  {'error': 'specify document: %s!' % refdoc},
                                  context_instance=RequestContext(request))
    refdoc = Doc(docid=refdoc)
    template_vars = pager(request, Docs.find({}, ['_id', 'docid']),
                          'docid', False)
    docs = sorted([(doc['docid'], doc['_id']) for doc in template_vars['data']])
    template_vars['docs'] = [
        {'id': doc.docid,
         'oid': str(doc._id),
         'indexed': doc.pippiDocsLen,
         'title': doc.title,
         'frags': doc.getFrags().count(),
         'pippies': len(doc.pippies),
         # 'job' marks docs not yet pippi'd against the reference document
         'job': not doc._id in refdoc.pippiDocs,
         'type': doc.type,
         'docs': len(doc.getRelatedDocIds()),
         'tags': doc.autoTags(25)}
        for doc in (Doc(docid=d) for d, oid in docs if not oid == refdoc._id)]
    template_vars['stats'] = getOverview()
    template_vars['refdoc'] = refdoc.docid
    template_vars['reftitle'] = refdoc.title
    template_vars['oid'] = str(refdoc._id)
    template_vars['starred'] = request.session.get('starred', set())
    return render_to_response('pippi.html', template_vars,
                              context_instance=RequestContext(request))
def __init__(self, raw=None, docid=None, oid=None, d=None):
    """Load an existing document or create a new one from raw content.

    Lookup priority: mongo *oid*, then *docid*, then a prefetched record *d*.
    When no record is found but *raw* is given, a fresh document is created
    and saved. Raises KeyError when there is nothing to load or create.
    """
    if oid:
        # get by mongo oid
        d = Docs.find_one({"_id": oid})
    elif docid:
        # get by docid
        d = Docs.find_one({"docid": docid})
    if d:
        # load the values straight into the instance dict
        self.__dict__.update(d)
    elif raw:
        # create a new document with empty pippi bookkeeping fields
        self.__dict__.update({
            'docid': docid,
            'pippies': [],
            'pippiDocs': [],
            'pippiDocsLen': 0,
            'rawid': None,
        })
        if not 'type' in self.__dict__:
            self.__dict__['type'] = 'raw'
        if not 'metadata' in self.__dict__:
            self.__dict__['metadata'] = {}
        if raw:
            # NOTE(review): self.raw looks like a property that stores the
            # body externally (there is a 'rawid' field) -- confirm
            self.raw = raw
            self.lang = guessLanguage(" ".join(self.text))
            self.save()
    else:
        raise KeyError('empty docid')
def __init__(self, raw=None, docid=None, oid=None, d=None, owner=None):
    """Load an existing document or create an owned one from raw content.

    Like the base loader, but new documents additionally record *owner*
    and default their title to the docid. Raises KeyError when there is
    nothing to load or create.
    """
    if oid:
        # get by mongo oid
        d = Docs.find_one({"_id": oid})
    elif docid:
        # get by docid
        d = Docs.find_one({"docid": docid})
    if d:
        # load the values straight into the instance dict
        self.__dict__.update(d)
    elif raw:
        # create a new document
        self.__dict__.update({
            'docid': docid,
            'owner': unicode(owner),
            'pippies': [],
            'pippiDocs': [],
            'pippiDocsLen': 0,
            'rawid': None,
            'title': docid,
        })
        if not 'type' in self.__dict__:
            self.__dict__['type'] = 'raw'
        if not 'metadata' in self.__dict__:
            self.__dict__['metadata'] = {}
        # NOTE(review): self.raw presumably persists the body out-of-band
        # (see 'rawid') -- confirm
        self.raw = raw
        self.__dict__['lang'] = guessLanguage(" ".join(self.text))
        self.stems  # for caching
        self.save()
    else:
        raise KeyError('empty docid')
def delete(self):
    """Delete this document: its mapped satellite records, then the Docs entry.

    Satellite removal is best-effort; a missing "<key>id" attribute or a
    failed remove is ignored.

    Fix: narrowed the bare ``except:`` to ``except Exception`` so that
    SystemExit/KeyboardInterrupt are no longer swallowed.
    """
    for key, col in self.fieldMap.items():
        try:
            col.remove({'_id': self.__dict__["%sid" % key]})
        except Exception:
            pass  # best-effort: satellite record or id field may be absent
    # TODO also remove pippies and other stuff cached after pippring
    Docs.remove({'_id': self.__dict__['_id']})
def getOverview():
    """Return corpus-wide statistics for display.

    Fix: each collection's ``count()`` was queried twice per stat entry;
    it is now fetched once and reused.
    """
    doccnt = Docs.count()
    pippicnt = Pippies.count()
    fragcnt = Frags.count()
    return [
        {'title': 'Total documents', 'value': doccnt,
         'text': "%s Documents" % doccnt},
        {'title': 'Total Pippies', 'value': pippicnt,
         'text': "with %s Pippies" % pippicnt},
        {'title': 'Locations', 'value': fragcnt,
         'text': "in %s Locations" % fragcnt},
    ]
def save(self, d1, d2, pkt):
    """Persist one pippi result packet linking documents *d1* and *d2*.

    Registers the pippi on both documents, adds both docs to the pippi's
    doc set, and stores every fragment location. Returns *pkt* (or None
    for an empty packet).

    Fixes: the fragment writes used a list comprehension purely for side
    effects (now a plain loop), and ``[d for d in [...]]`` was a no-op
    comprehension (now the literal list).
    """
    # todo new code to directly addtoset mongo-style
    if not pkt:
        return
    pippi = Pippi(pkt['pippi'])
    Docs.update({'_id': d1._id}, {'$addToSet': {'pippies': pippi._id}})
    Docs.update({'_id': d2._id}, {'$addToSet': {'pippies': pippi._id}})
    Pippies.update({'_id': pippi._id},
                   {'$addToSet': {'docs': {'$each': [d1._id, d2._id]}},
                    '$inc': {'docslen': 2}})
    # store every fragment location of both documents
    for docoid, p in ([(d1._id, p) for p in pkt['d1ps']] +
                      [(d2._id, p) for p in pkt['d2ps']]):
        Frags.save({'pos': p['pos'], 'txt': p['txt'], 'l': pkt['l'],
                    'doc': docoid, 'pippi': pippi._id})
    return pkt
def filterDocs(request):
    """Return a JSON page of documents matching the q/starred/mine filters."""
    query = {}
    needle = request.GET.get('q')
    if needle:
        query = {'title': re.compile(needle, re.I)}
    if request.GET.get('starred') == 'true':
        query['_id'] = {'$in': [ObjectId(x)
                                for x in request.session.get('starred', ())]}
    if request.GET.get('mine') == 'true':
        query['owner'] = unicode(request.user)
    res = pager(request,
                Docs.find(query, sort=[('_id', pymongo.DESCENDING)]),
                'docid', False)
    starred = request.session.get('starred', set())
    # build the serializable per-document rows with a plain loop
    rows = []
    for record in res['data']:
        doc = Doc(d=record)
        is_starred = str(doc._id) in starred
        rows.append({'id': doc.docid,
                     'starred': u'\u2605' if is_starred else u'\u2606',
                     'starclass': 'starred' if is_starred else '',
                     'title': doc.title,
                     'meta': doc.metadata,
                     'oid': str(doc._id),
                     'indexed': doc.pippiDocs,
                     'pippies': len(doc.pippies),
                     'type': doc.type,
                     'tags': doc.autoTags(25),
                     })
    res['docs'] = rows
    return HttpResponse(jdump(res), mimetype="application/json")
def __init__(self, docid=None, *args, **kwargs):
    """Construct an etherpad-backed document from a pad URL-like docid.

    When *docid* matches PADRE the pad's latest HTML export is fetched,
    tidied to XHTML and stored as the raw body (unless a document with the
    same docid already exists). Falls through to the base initializer.
    """
    self.__dict__['type'] = 'etherpad'
    if docid:
        hostValidator = PADRE.search(docid)
        if hostValidator:
            if hostValidator.group(2) and hostValidator.group(3):
                # normalize the docid to host/padname
                docid = ("%s/%s" % (hostValidator.group(2),
                                    hostValidator.group(3))).encode('utf8')
                kwargs['docid'] = docid
                url = "%s%s/ep/pad/export/%s/latest?format=html" % (
                    hostValidator.group(1) or 'http://',
                    hostValidator.group(2),
                    hostValidator.group(3))
                if not Docs.find_one({"docid": docid}):
                    # fetch and clean the pad export
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    self.__dict__['title'] = unescape(unicode(''.join(
                        soup.title.findAll(text=True)))).strip().encode('utf8')
                    doc = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>%s</html>' % (
                        self.title, unescape(unicode(soup.body)).encode('utf8'))
                    # run through tidy to get well-formed XHTML
                    raw = str(tidy.parseString(doc, **{'output_xhtml': 1,
                                                       'add_xml_decl': 0,
                                                       'indent': 0,
                                                       'tidy_mark': 0,
                                                       'doctype': "strict",
                                                       'wrap': 0}))
                    kwargs['raw'] = raw
                kwargs['docid'] = docid
                super(Etherpad, self).__init__(*args, **kwargs)
                if not 'stems' in self.__dict__ or not self.stems:
                    # let's calculate and cache the results
                    models.tfidf.add_input_document(self.termcnt.keys())
                    self.save()
                return
    # not a recognized pad reference: plain base-class construction
    kwargs['docid'] = docid
    super(Etherpad, self).__init__(*args, **kwargs)
def filterDocs(request):
    """JSON endpoint: page of documents filtered by title/starred/owner."""
    q = request.GET.get('q')
    query = {'title': re.compile(q, re.I)} if q else {}
    if request.GET.get('starred') == 'true':
        star_oids = [ObjectId(x) for x in request.session.get('starred', ())]
        query['_id'] = {'$in': star_oids}
    if request.GET.get('mine') == 'true':
        query['owner'] = unicode(request.user)
    page = pager(request,
                 Docs.find(query, sort=[('_id', pymongo.DESCENDING)]),
                 'docid', False)
    starred = request.session.get('starred', set())
    page['docs'] = [{'id': doc.docid,
                     # solid star for starred docs, outline otherwise
                     'starred': u'\u2605' if str(doc._id) in starred else u'\u2606',
                     'starclass': 'starred' if str(doc._id) in starred else '',
                     'title': doc.title,
                     'meta': doc.metadata,
                     'oid': str(doc._id),
                     'indexed': doc.pippiDocs,
                     'pippies': len(doc.pippies),
                     'type': doc.type,
                     'tags': doc.autoTags(25),
                     } for doc in (Doc(d=d) for d in page['data'])]
    return HttpResponse(jdump(page), mimetype="application/json")
def __init__(self, docid=None, *args, **kwargs):
    """Construct a co-ment-backed document from a co-ment URL-like docid.

    When *docid* matches CMTRE, the text (and its comments page) is fetched
    over https and wrapped into an HTML raw body, unless a document with the
    same docid already exists. Falls through to the base initializer.
    """
    self.__dict__['type'] = 'co-ment'
    if docid:
        hostValidator = CMTRE.search(docid)
        if hostValidator:
            if hostValidator.group(1) or hostValidator.group(3) or hostValidator.group(5):
                # normalize docid to host + text path
                docid = ("%s%s" % (hostValidator.group(2),
                                   hostValidator.group(4))).encode('utf8')
                kwargs['docid'] = docid
                url = "https://%s/text/%s/view/" % (hostValidator.group(2),
                                                    hostValidator.group(4))
                if not Docs.find_one({"docid": docid}):
                    # title comes from the view page
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    self.__dict__['title'] = unescape(unicode(''.join(
                        soup.title.findAll(text=True)))).strip().encode('utf8')
                    # the body comes from the comments page's textcontainer
                    dataurl = "https://%s/text%s/comments/" % (
                        hostValidator.group(2), hostValidator.group(4))
                    data = urllib2.urlopen(dataurl).read()
                    soup = BeautifulSoup(data)
                    kwargs['raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (
                        self.title,
                        unescape(unicode(soup.find(attrs={'id': 'textcontainer'}))).encode('utf8'))
                kwargs['docid'] = docid
                super(Coment, self).__init__(*args, **kwargs)
                if not 'stems' in self.__dict__ or not self.stems:
                    # let's calculate and cache the results
                    models.tfidf.add_input_document(self.termcnt.keys())
                    self.save()
                return
    # not a recognized co-ment reference: plain base-class construction
    kwargs['docid'] = docid
    super(Coment, self).__init__(*args, **kwargs)
def docView(request, doc=None, cutoff=10):
    """Render a document and its related documents at the given cutoff.

    Fix: narrowed the bare ``except:`` around the Doc lookup to
    ``except Exception`` so SystemExit/KeyboardInterrupt are not turned
    into a 404.

    :param doc: docid to display; 404s when it cannot be loaded.
    :param cutoff: minimum fragment length, overridable via ?cutoff=N.
    """
    if request.GET.get('cutoff', 0):
        cutoff = int(request.GET['cutoff'])
    if not doc or not cutoff:
        return render_to_response(
            'error.html', {'error': 'Missing document or wrong cutoff!'},
            context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        raise Http404
    cont = d.body
    relDocs = Docs.find(
        {'_id': {'$in': list(d.getRelatedDocIds(cutoff=cutoff))}},
        ['docid', 'title'])
    return render_to_response('docView.html', {
        'doc': d,
        'oid': d._id,
        'user': request.user,
        'content': cont,
        'related': relDocs,
        'cutoff': cutoff,
        'cutoffs': ','.join(cutoffSL(d, cutoff)),
        'len': d.getFrags(cutoff=cutoff).count()
    }, context_instance=RequestContext(request))
def starred(request):
    """List the documents starred in this session, ordered by docid."""
    oids = [ObjectId(x) for x in request.session.get('starred', ())]
    cursor = Docs.find({'_id': {'$in': oids}},
                       sort=[('docid', pymongo.ASCENDING)])
    template_vars = pager(request, cursor, 'docid', False)
    template_vars['title'] = 'Your starred documents'
    return _listDocs(request, template_vars)
def getOverview():
    """Build the corpus statistics shown on overview pages.

    Fix: ``count()`` was issued twice per collection (once for 'value',
    once for 'text'); each count is now queried a single time.
    """
    ndocs = Docs.count()
    npippies = Pippies.count()
    nfrags = Frags.count()
    stats = []
    stats.append({'title': 'Total documents',
                  'value': ndocs,
                  'text': "%s Documents" % ndocs})
    stats.append({'title': 'Total Pippies',
                  'value': npippies,
                  'text': "with %s Pippies" % npippies})
    stats.append({'title': 'Locations',
                  'value': nfrags,
                  'text': "in %s Locations" % nfrags})
    return stats
def __init__(self, docid=None, *args, **kwargs):
    """Construct a eurlex document from a celex id or a shortcut alias.

    Parses the docid into sector/year/doctype/refno/lang, and on first
    sight fetches the HTML from EURLEX (with up to 4 retries on the known
    error pages), anchors its articles and extracts metadata.
    Raises ValueError on persistent request/parameter errors or when only
    a non-English placeholder is available.
    """
    setTitle = False
    if docid:
        alias = re.match(SHORTCUTRE, docid)
        if alias:
            # shortcut notation: fixed sector, mapped doctype
            self.__dict__['sector'] = '3'
            self.__dict__['year'] = alias.group(2)
            self.__dict__['doctype'] = SHORTCUTMAP[alias.group(1)]
            self.__dict__['refno'] = "%04d" % int(alias.group(3))
            self.__dict__['lang'] = 'EN'  # assuming default
        else:
            # celex notation, e.g. "celex:31995L0046:EN"
            (code, lang) = docid.split(":")[1:3]
            # doctype may be one or two characters long
            st = 7 if code[6].isalpha() else 6
            self.__dict__['sector'] = code[0]
            self.__dict__['year'] = code[1:5]
            self.__dict__['doctype'] = code[5:st]
            self.__dict__['refno'] = code[st:]
            self.__dict__['lang'] = lang
        self.__dict__['type'] = 'eurlex'
        kwargs['docid'] = self.docid
        if not Docs.find_one({"docid": self.docid}):
            retries = 4
            while True:
                raw = CACHE.fetchUrl(EURLEXURL + self.docid + ":HTML")
                soup = BeautifulSoup(raw)
                # TODO handle empty or invalid celex ids - also handle other languages besides english!!!
                # <TITLE>Request Error</TITLE>
                # <h1>The parameters of the link are incorrect.</h1>
                if soup.title and soup.title.string == "Request Error":
                    if retries > 0:
                        retries = retries - 1
                        continue
                    else:
                        raise ValueError, "Request Error"
                if soup.h1 and soup.h1.string == 'The parameters of the link are incorrect.':
                    if retries > 0:
                        retries = retries - 1
                        continue
                    else:
                        raise ValueError, "Parameter Error"
                # no errors found, continue, nothing to see here
                break
            # > /* There is no English version of this document available since it was not included in the English Special Edition.
            content = soup.find(id='TexteOnly')
            if (content and content.findAll('p') and
                    len(content.findAll('p')) > 1 and
                    'string' in dir(content.findAll('p')[1]) and
                    content.findAll('p')[1].string.strip().startswith('/* There is no English version of this document available since it was not included in the English Special Edition.')):
                raise ValueError, "Language Error"
            kwargs['raw'] = anchorArticles(raw)
            self.__dict__['metadata'] = self.extractMetadata()
            setTitle = True
    super(Eurlex, self).__init__(*args, **kwargs)
    if setTitle:
        # title extraction needs the fully initialized document
        self.__dict__['title'] = self._gettitle()
        self.save()
def starred(request):
    """Render the corpus page restricted to the session's starred documents.

    Fix: removed the unused ``docslen = Docs.count()`` query.
    """
    template_vars = pager(
        request,
        Docs.find({'_id': {'$in': [ObjectId(x) for x in
                                   request.session.get('starred', ())]}},
                  ['_id', 'docid']),
        'docid', False)
    docs = [(doc['docid'], doc['_id']) for doc in template_vars['data']]
    template_vars['docs'] = [
        {'id': doc.docid,
         'oid': str(doc._id),
         'indexed': doc.pippiDocsLen,
         'title': doc.title,
         'frags': doc.getFrags().count(),
         'pippies': len(doc.pippies),
         'type': doc.type,
         'docs': len(doc.getRelatedDocIds()),
         'tags': doc.autoTags(25)}
        for doc in (Doc(docid=d) for d, oid in docs)]
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())
    template_vars['title'] = 'Your starred documents'
    return render_to_response('corpus.html', template_vars,
                              context_instance=RequestContext(request))
def starred(request):
    """Show the paged list of this session's starred documents."""
    starred_ids = request.session.get('starred', ())
    query = {'_id': {'$in': [ObjectId(s) for s in starred_ids]}}
    ctx = pager(request,
                Docs.find(query, sort=[('docid', pymongo.ASCENDING)]),
                'docid', False)
    ctx['title'] = 'Your starred documents'
    return _listDocs(request, ctx)
def metaView(request, doc=None):
    """Show a document's metadata page, or an upload form for unknown docids.

    Fix: narrowed the bare ``except:`` to ``except Exception`` so that
    SystemExit/KeyboardInterrupt no longer fall into the upload-form path.
    """
    if not doc:
        return render_to_response('error.html', {'error': 'Missing document!'},
                                  context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        # unknown/unfetchable docid: offer to upload it instead
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', {
            'form': form,
        }, context_instance=RequestContext(request))
    relDocs = Docs.find({'_id': {'$in': list(d.getRelatedDocIds(cutoff=5))}},
                        ['docid', 'title'])
    return render_to_response('meta.html',
                              {'doc': d,
                               'related': relDocs,
                               'metadata': d.metadata,
                               }, context_instance=RequestContext(request))
def metaView(request, doc=None):
    """Show a document's metadata page (owner-aware variant; 404 on failure).

    Fix: narrowed the bare ``except:`` to ``except Exception`` so that
    SystemExit/KeyboardInterrupt are not converted into a 404.
    """
    if not doc:
        return render_to_response('error.html', {'error': 'Missing document!'},
                                  context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        raise Http404
    relDocs = Docs.find({'_id': {'$in': list(d.getRelatedDocIds(cutoff=5))}},
                        ['docid', 'title'])
    return render_to_response('meta.html',
                              {'doc': d,
                               'oid': d._id,
                               'related': relDocs,
                               'metadata': d.metadata,
                               }, context_instance=RequestContext(request))
def main(): print "updateing pippies.relevance" pippies=Pippies.find({},['docs','len']) pippieslen=pippies.count() i=1 for pippi in pippies: if (i*100/pippieslen)!=((i-1)*100/pippieslen): if (i*100/pippieslen) % 10 == 0: sys.stdout.write("%d" % (i*100/pippieslen)) sys.stdout.flush() else: sys.stdout.write('.') sys.stdout.flush() if len(pippi['docs'])>0: Pippies.update({'_id' : pippi['_id']}, { '$set': { 'relevance': float(pippi['len'])/float(len(pippi['docs'])), 'docslen': len(pippi['docs']),}, }) i=i+1 sys.stdout.write('\n') sys.stdout.flush() print "updateing docs.idf" docs=Docs.find({},['termcnt','docid','stemsid','rawid']) docslen=docs.count() i=1 for dd in docs: if (i*100/docslen)!=((i-1)*100/docslen): if (i*100/docslen) % 10 == 0: sys.stdout.write("%d" % (i*100/docslen)) sys.stdout.flush() else: sys.stdout.write('.') sys.stdout.flush() Docs.update({'_id': dd['_id']},{ '$set': { 'tfidf': Doc(d=dd).tfidf } }) i=i+1 sys.stdout.write('\n') sys.stdout.flush()
def frags(request):
    """List fragment locations, filterable by doc, pippi or minimum length.

    GET params: ``doc`` (ObjectId), ``pippi`` (ObjectId), ``cutoff`` (int,
    ignored when a pippi filter is given). When filtering by pippi, each
    fragment text is diffed against the previous one for display.
    """
    filtr = {}
    template_vars = {}
    docfilter = None
    cutoff = None
    pippifilter = None
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except:
        pass  # absent or malformed ObjectId: no doc filter
    if docfilter:
        filtr['doc'] = docfilter
    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi', '')))
    except:
        pass  # absent or malformed ObjectId: no pippi filter
    if pippifilter:
        filtr['pippi'] = pippifilter
    else:
        # the cutoff only applies when not pinned to a single pippi
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
        except:
            pass
        if cutoff:
            filtr['l'] = {'$gte': cutoff}
    orderBy = 'l'
    orderDesc = True
    template_vars = pager(request, Frags.find(filtr), orderBy, orderDesc)
    prevDoc = None
    template_vars['frags'] = []
    for frag in template_vars['data']:
        p = Pippi('', oid=frag['pippi'])
        d = Doc(oid=frag['doc'])
        if pippifilter:
            # show only the differences to the previously listed fragment
            frag['txt'] = diffFrag(prevDoc, frag['txt'])
            prevDoc = frag['txt']
        template_vars['frags'].append({'_id': frag['_id'],
                                       'pos': frag['pos'],
                                       'txt': " ".join(frag['txt']),
                                       'len': frag['l'],
                                       # score: sum of the doc's tfidf over the pippi terms
                                       'score': sum([d.tfidf.get(t, 0) for t in p.pippi]),
                                       'pippi': p,
                                       'doc': d,
                                       })
    template_vars['pippi'] = pippifilter
    template_vars['doc'] = docfilter
    if docfilter:
        template_vars['docTitle'] = Docs.find_one({'_id': docfilter}, ['docid'])['docid']
    if pippifilter:
        template_vars['pippiFilter'] = 1  #" ".join(Pippies.find_one({'_id': pippifilter},['pippi'])['pippi'])
    return render_to_response('frags.html', template_vars,
                              context_instance=RequestContext(request))
def pippi(request, refdoc=None):
    """Render the pippi page for *refdoc*: all other documents with status.

    Fix: dropped the unused ``docslen = Docs.count()`` database query.
    """
    if not refdoc:
        return render_to_response('error.html',
                                  {'error': 'specify document: %s!' % refdoc},
                                  context_instance=RequestContext(request))
    refdoc = Doc(docid=refdoc)
    template_vars = pager(request, Docs.find({}, ['_id', 'docid']),
                          'docid', False)
    docs = sorted([(doc['docid'], doc['_id']) for doc in template_vars['data']])
    template_vars['docs'] = [
        {'id': doc.docid,
         'oid': str(doc._id),
         'indexed': doc.pippiDocsLen,
         'title': doc.title,
         'frags': doc.getFrags().count(),
         'pippies': len(doc.pippies),
         # not yet pippi'd against the reference doc -> candidate job
         'job': not doc._id in refdoc.pippiDocs,
         'type': doc.type,
         'docs': len(doc.getRelatedDocIds()),
         'tags': doc.autoTags(25)}
        for doc in (Doc(docid=d) for d, oid in docs if not oid == refdoc._id)]
    template_vars['stats'] = getOverview()
    template_vars['refdoc'] = refdoc.docid
    template_vars['reftitle'] = refdoc.title
    template_vars['oid'] = str(refdoc._id)
    template_vars['starred'] = request.session.get('starred', set())
    return render_to_response('pippi.html', template_vars,
                              context_instance=RequestContext(request))
def Doc(*args, **kwargs):
    """Factory: instantiate the matching Doc subclass for the given reference.

    Dispatch order: docid pattern match, then a prefetched record's
    type/docid, then the stored type of a mongo oid. Falls back to the
    generic DOC class.
    """
    if 'docid' in kwargs:
        for doctype, cls, rx in DOCTYPES:
            if re.match(rx, kwargs['docid']):
                return cls(*args, **kwargs)
    if 'd' in kwargs:
        record = kwargs['d']
        for doctype, cls, rx in DOCTYPES:
            if record.get('type', '') == doctype or re.match(rx, record.get('docid', '')):
                return cls(*args, **kwargs)
    if 'oid' in kwargs:
        stored_type = Docs.find_one({"_id": kwargs['oid']}, ['type'])['type']
        for doctype, cls, rx in DOCTYPES:
            if stored_type == doctype:
                return cls(*args, **kwargs)
    return DOC(*args, **kwargs)
def pippies(request):
    """List pippies, filterable by cutoff length, document and relevance.

    Fix: the three bare ``except:`` clauses around GET-parameter parsing
    were narrowed (``ValueError`` for int(), ``Exception`` for ObjectId)
    so SystemExit/KeyboardInterrupt are no longer swallowed.
    """
    filtr = {}
    template_vars = {}
    docfilter = None
    relfilter = None
    cutoff = None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except ValueError:
        pass  # non-numeric cutoff: ignore
    if cutoff:
        filtr['len'] = {'$gte': cutoff}
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        pass  # absent or malformed ObjectId: no doc filter
    if docfilter:
        filtr['docs'] = docfilter
    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except ValueError:
        pass  # non-numeric relevance: ignore
    if relfilter:
        filtr['relevance'] = relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = True if '1' == cgi.escape(request.GET.get('desc', '1')) else False
    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [{
        'id': pippi['_id'],
        # empty terms are displayed as wildcards
        'pippi': ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
        'docslen': len(pippi['docs']),
        'relevance': pippi.get('relevance', 0),
    } for pippi in template_vars['data']]
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        template_vars['docTitle'] = doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html', template_vars,
                              context_instance=RequestContext(request))
def search(request):
    """Stem-based corpus search: match documents whose stems contain all
    query stems (stopwords and one-letter stems act as wildcards)."""
    query_s = cgi.escape(request.GET.get('q', ''))
    if not query_s:
        return render_to_response('error.html',
                                  {'error': 'Missing search query!'},
                                  context_instance=RequestContext(request))
    lang = guessLanguage(query_s)
    stopwords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    stemmer = getStemmer(lang)
    stems = []
    for token in nltk.tokenize.wordpunct_tokenize(unicode(query_s)):
        # stem each word
        stemmed = stemmer.stem(token.encode('utf8'))
        if stemmed and stemmed[0] not in stopwords and len(stemmed[0]) > 1:
            stems.append(stemmed[0])
        else:
            stems.append('')
    hits = [row['_id']
            for row in DocStems.find({'value': {'$all': stems}}, ['_id'])]
    template_vars = pager(request,
                          Docs.find({"stemsid": {'$in': hits}}),
                          'docid', False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = query_s
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())
    template_vars['docs'] = [{'id': doc.docid,
                              'oid': str(doc._id),
                              'indexed': doc.pippiDocsLen,
                              'title': doc.title,
                              'frags': doc.getFrags().count(),
                              'pippies': len(doc.pippies),
                              'type': doc.type,
                              'docs': len(doc.getRelatedDocIds()),
                              'tags': doc.autoTags(25)}
                             for doc in (Doc(d=d) for d in template_vars['data'])]
    return render_to_response('search.html', template_vars,
                              context_instance=RequestContext(request))
def metaView(request, doc=None):
    """Render the metadata page for *doc*; 404 when it cannot be loaded.

    Fix: the bare ``except:`` was narrowed to ``except Exception`` so that
    SystemExit/KeyboardInterrupt do not become a 404.
    """
    if not doc:
        return render_to_response('error.html',
                                  {'error': 'Missing document!'},
                                  context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        raise Http404
    relDocs = Docs.find({'_id': {'$in': list(d.getRelatedDocIds(cutoff=5))}},
                        ['docid', 'title'])
    return render_to_response('meta.html', {
        'doc': d,
        'oid': d._id,
        'related': relDocs,
        'metadata': d.metadata,
    }, context_instance=RequestContext(request))
def docView(request, doc=None, cutoff=10):
    """Render a document and its related documents; unknown docids get an
    upload form instead.

    Fix: the bare ``except:`` was narrowed to ``except Exception`` so that
    SystemExit/KeyboardInterrupt no longer fall into the upload-form path.
    """
    if request.GET.get('cutoff', 0):
        cutoff = int(request.GET['cutoff'])
    if not doc or not cutoff:
        return render_to_response('error.html',
                                  {'error': 'Missing document or wrong cutoff!'},
                                  context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        # unknown/unfetchable docid: offer to upload it instead
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', {
            'form': form,
        }, context_instance=RequestContext(request))
    cont = d.body
    relDocs = Docs.find({'_id': {'$in': list(d.getRelatedDocIds(cutoff=cutoff))}},
                        ['docid', 'title'])
    return render_to_response('docView.html',
                              {'doc': d,
                               'oid': d._id,
                               'user': request.user,
                               'content': cont,
                               'related': relDocs,
                               'cutoff': cutoff,
                               'cutoffs': ','.join(cutoffSL(d, cutoff)),
                               'len': d.getFrags(cutoff=cutoff).count()},
                              context_instance=RequestContext(request))
def __init__(self, docid=None, *args, **kwargs):
    """Construct a co-ment-backed document (formatted variant).

    When *docid* matches CMTRE the text view supplies the title and the
    comments page supplies the body; both are wrapped into an HTML raw
    document unless a record with this docid already exists. Otherwise
    falls through to the base initializer.
    """
    self.__dict__['type'] = 'co-ment'
    if docid:
        hostValidator = CMTRE.search(docid)
        if hostValidator:
            if hostValidator.group(1) or hostValidator.group(
                    3) or hostValidator.group(5):
                # normalize docid to host + text path
                docid = ("%s%s" % (hostValidator.group(2),
                                   hostValidator.group(4))).encode('utf8')
                kwargs['docid'] = docid
                url = "https://%s/text/%s/view/" % (hostValidator.group(2),
                                                    hostValidator.group(4))
                if not Docs.find_one({"docid": docid}):
                    # the view page yields the title
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    self.__dict__['title'] = unescape(
                        unicode(''.join(soup.title.findAll(
                            text=True)))).strip().encode('utf8')
                    # the comments page yields the body (textcontainer div)
                    dataurl = "https://%s/text%s/comments/" % (
                        hostValidator.group(2), hostValidator.group(4))
                    data = urllib2.urlopen(dataurl).read()
                    soup = BeautifulSoup(data)
                    kwargs[
                        'raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (
                            self.title,
                            unescape(
                                unicode(
                                    soup.find(attrs={'id': 'textcontainer'
                                                     }))).encode('utf8'))
                kwargs['docid'] = docid
                super(Coment, self).__init__(*args, **kwargs)
                if not 'stems' in self.__dict__ or not self.stems:
                    # let's calculate and cache the results
                    models.tfidf.add_input_document(self.termcnt.keys())
                    self.save()
                return
    # not a recognized co-ment reference: plain base-class construction
    kwargs['docid'] = docid
    super(Coment, self).__init__(*args, **kwargs)
def search(request):
    """Search the corpus by query stems; stopwords become wildcard slots."""
    raw_query = cgi.escape(request.GET.get('q', ''))
    if not raw_query:
        return render_to_response('error.html',
                                  {'error': 'Missing search query!'},
                                  context_instance=RequestContext(request))
    lang = guessLanguage(raw_query)
    swords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    engine = getStemmer(lang)
    filtr = []
    # stem each word; stopwords / one-letter stems are kept as '' wildcards
    for word in nltk.tokenize.wordpunct_tokenize(unicode(raw_query)):
        stem = engine.stem(word.encode('utf8'))
        keep = stem and stem[0] not in swords and len(stem[0]) > 1
        filtr.append(stem[0] if keep else '')
    matches = [x['_id'] for x in DocStems.find({'value': {'$all': filtr}},
                                               ['_id'])]
    template_vars = pager(request, Docs.find({"stemsid": {'$in': matches}}),
                          'docid', False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = raw_query
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())
    template_vars['docs'] = [{'id': doc.docid,
                              'oid': str(doc._id),
                              'indexed': doc.pippiDocsLen,
                              'title': doc.title,
                              'frags': doc.getFrags().count(),
                              'pippies': len(doc.pippies),
                              'type': doc.type,
                              'docs': len(doc.getRelatedDocIds()),
                              'tags': doc.autoTags(25)}
                             for doc in (Doc(d=d) for d in template_vars['data'])]
    return render_to_response('search.html', template_vars,
                              context_instance=RequestContext(request))
def pippies(request):
    """List pippies with optional length/doc/relevance filters (unformatted twin).

    Fix: narrowed the three bare ``except:`` clauses (``ValueError`` for
    int parsing, ``Exception`` for ObjectId parsing).
    """
    filtr = {}
    template_vars = {}
    docfilter = None
    relfilter = None
    cutoff = None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except ValueError:
        pass  # non-numeric cutoff: ignore
    if cutoff:
        filtr['len'] = {'$gte': cutoff}
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        pass  # absent or malformed ObjectId: no doc filter
    if docfilter:
        filtr['docs'] = docfilter
    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except ValueError:
        pass  # non-numeric relevance: ignore
    if relfilter:
        filtr['relevance'] = relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = True if '1' == cgi.escape(request.GET.get('desc', '1')) else False
    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [
        {'id': pippi['_id'],
         # empty terms display as wildcards
         'pippi': ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
         'docslen': len(pippi['docs']),
         'relevance': pippi.get('relevance', 0), }
        for pippi in template_vars['data']]
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        template_vars['docTitle'] = doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html', template_vars,
                              context_instance=RequestContext(request))
def docView(request, doc=None, cutoff=10):
    """Render a document with all matching fragments highlighted in the body.

    For every unique fragment at or above *cutoff*, a regex tolerant of
    interleaved HTML tags locates the text in the body and wraps it in
    highlight spans; per-pippi tooltips are collected alongside.
    Unknown docids are answered with an upload form.
    """
    if request.GET.get('cutoff', 0):
        cutoff = int(request.GET['cutoff'])
    if not doc or not cutoff:
        return render_to_response('error.html',
                                  {'error': 'Missing document or wrong cutoff!'},
                                  context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except:
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', {
            'form': form,
        }, context_instance=RequestContext(request))
    tooltips = {}
    cont = d.body
    relDocs = Docs.find({'_id': {'$in': list(d.getRelatedDocIds(cutoff=cutoff))}},
                        ['docid', 'title'])
    ls = []
    matches = 0
    for l in d.getFrags(cutoff=cutoff):
        # frags arrive ordered by length, so stop at the first short one
        if (l['l'] < cutoff):
            break
        # for unique locset - optimalization?!
        if l['txt'] in ls:
            continue
        ls.append(l['txt'])
        t = l['txt']
        # for valid matches: require a non-word boundary around alphanumeric edges
        btxt = ''
        etxt = ''
        if t[0][0].isalnum():
            btxt = '\W'
        if t[-1][-1].isalnum():
            etxt = '\W'
        # allow whitespace and HTML tags between the fragment's tokens
        rtxt = btxt + '\s*(?:<[^>]*>\s*)*'.join([re.escape(x) for x in t]) + etxt
        regex = re.compile(rtxt, re.I | re.M | re.U)
        i = 0  # NOTE(review): appears unused
        offset = 0  # running shift caused by span markup inserted so far
        #print "[!] Finding: %s\n\tPos: %s\n\t%s\n" % (' '.join(t), l['pos'], rtxt)
        if not l['pippi'] in tooltips:
            tooltips[l['pippi']] = annotatePippi(d, l, cutoff)
        for r in regex.finditer(cont):
            #print '[!] Match: %s\n\tStartpos: %d\n\tEndpos: %d' % (r.group(), r.start(), r.end())
            span = (('<span class="highlight %s">') % l['pippi'], '</span>')
            # translate match positions into the already-modified body
            start = r.start() + offset
            if btxt:
                start += 1  # skip the leading \W the regex consumed
            end = r.end() + offset
            if etxt:
                end -= 1  # exclude the trailing \W the regex consumed
            # close/reopen the span around embedded tags so the HTML stays valid
            match, n = re.compile(r'((?:\s*<[^>]+>)+)', re.M | re.U).subn(
                r'%s\1%s' % (span[1], span[0]), cont[start:end])
            cont = cont[:start] + span[0] + match + span[1] + cont[end:]
            offset += (n + 1) * (len(span[0]) + len(span[1]))
            matches += 1
        #print '_'*60
    #print '-'*120
    cont = anchorArticles(cont)
    #print "[!] Rendering\n\tContent length: %d" % len(cont)
    return render_to_response('docView.html',
                              {'doc': d,
                               'oid': d._id,
                               'user': request.user,
                               'content': cont,
                               'related': relDocs,
                               'cutoff': cutoff,
                               'cutoffs': ','.join(cutoffSL(d, cutoff)),
                               'len': len(ls),
                               'tooltips': '\n'.join(tooltips.values()),
                               'matches': matches},
                              context_instance=RequestContext(request))
def listDocs(request):
    """Page through the whole corpus, newest docid first."""
    cursor = Docs.find(sort=[('docid', pymongo.DESCENDING)])
    ctx = pager(request, cursor, 'docid', False)
    ctx['title'] = 'Complete Corpus of pippi longstrings'
    return _listDocs(request, ctx)
def save(self):
    """Persist this document to Docs, temporarily stripping the attributes
    mapped to satellite collections so they are not serialized inline."""
    # stash the satellite attributes currently present on the instance
    stashed = [(name, self.__dict__[name]) for name in self.fieldMap
               if name in self.__dict__]
    for name, _unused in stashed:
        del self.__dict__[name]
    # write the slimmed-down record and remember the assigned _id
    self.__dict__['_id'] = Docs.save(self.__dict__)
    # restore the stashed attributes
    for name, value in stashed:
        self.__dict__[name] = value
def frags(request):
    """List fragment locations filtered by doc, pippi or minimum length.

    Fix: narrowed the bare ``except:`` clauses around GET-parameter
    parsing (``Exception`` for ObjectId, ``ValueError`` for int()).
    """
    filtr = {}
    template_vars = {}
    docfilter = None
    cutoff = None
    pippifilter = None
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        pass  # absent or malformed ObjectId: no doc filter
    if docfilter:
        filtr['doc'] = docfilter
    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi', '')))
    except Exception:
        pass  # absent or malformed ObjectId: no pippi filter
    if pippifilter:
        filtr['pippi'] = pippifilter
    else:
        # the length cutoff only applies when not pinned to a single pippi
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
        except ValueError:
            pass
        if cutoff:
            filtr['l'] = {'$gte': cutoff}
    orderBy = 'l'
    orderDesc = True
    template_vars = pager(request, Frags.find(filtr), orderBy, orderDesc)
    prevDoc = None
    template_vars['frags'] = []
    for frag in template_vars['data']:
        p = Pippi('', oid=frag['pippi'])
        d = Doc(oid=frag['doc'])
        if pippifilter:
            # show only differences relative to the previous fragment
            frag['txt'] = diffFrag(prevDoc, frag['txt'])
            prevDoc = frag['txt']
        template_vars['frags'].append({
            '_id': frag['_id'],
            'pos': frag['pos'],
            'txt': " ".join(frag['txt']),
            'len': frag['l'],
            # score: the doc's tfidf summed over the pippi terms
            'score': sum([d.tfidf.get(t, 0) for t in p.pippi]),
            'pippi': p,
            'doc': d,
        })
    template_vars['pippi'] = pippifilter
    template_vars['doc'] = docfilter
    if docfilter:
        template_vars['docTitle'] = Docs.find_one({'_id': docfilter},
                                                  ['docid'])['docid']
    if pippifilter:
        template_vars[
            'pippiFilter'] = 1  #" ".join(Pippies.find_one({'_id': pippifilter},['pippi'])['pippi'])
    return render_to_response('frags.html', template_vars,
                              context_instance=RequestContext(request))
def listDocs(request):
    """Render the complete corpus listing (descending docid order)."""
    template_vars = pager(request,
                          Docs.find(sort=[('docid', pymongo.DESCENDING)]),
                          'docid',
                          False)
    template_vars['title'] = 'Complete Corpus of pippi longstrings'
    return _listDocs(request, template_vars)