示例#1
0
 def addDocs(self,d1,d2):
     """Register d1 and d2 as compared with each other.

     Each document gets the other's ``_id`` pushed onto its ``pippiDocs``
     list and its ``pippiDocsLen`` counter incremented by one.
     """
     # d1 is updated first, then d2; each receives its partner's id.
     for target, partner in ((d1, d2), (d2, d1)):
         Docs.update({"_id" : target._id},
                     { '$push' : { 'pippiDocs' : partner._id },
                       '$inc' : { 'pippiDocsLen' : 1 }})
示例#2
0
def pippi(request, refdoc=None):
    """Render 'pippi.html': the paged document list seen from a reference document.

    Args:
        request: the incoming HTTP request.
        refdoc: docid of the reference document; when missing, 'error.html'
            is rendered instead.

    Returns:
        The rendered response. Each listed document carries a ``job`` flag
        that is true when its ``_id`` is not in the reference document's
        ``pippiDocs``; the reference document itself is left out of the list.
    """
    if not refdoc:
        return render_to_response('error.html',
                                  {'error': 'specify document: %s!' % refdoc},
                                  context_instance=RequestContext(request))
    refdoc = Doc(docid=refdoc)
    template_vars = pager(request, Docs.find({}, ['_id', 'docid']), 'docid',
                          False)
    # (docid, oid) pairs of the current page, ordered by docid
    docs = sorted([(doc['docid'], doc['_id'])
                   for doc in template_vars['data']])
    # The former `docslen = Docs.count()` was never read and only cost a
    # database round-trip, so it has been dropped.
    template_vars['docs'] = [{
        'id': doc.docid,
        'oid': str(doc._id),
        'indexed': doc.pippiDocsLen,
        'title': doc.title,
        'frags': doc.getFrags().count(),
        'pippies': len(doc.pippies),
        'job': doc._id not in refdoc.pippiDocs,
        'type': doc.type,
        'docs': len(doc.getRelatedDocIds()),
        'tags': doc.autoTags(25)
    } for doc in (Doc(docid=d) for d, oid in docs if oid != refdoc._id)]
    template_vars['stats'] = getOverview()
    template_vars['refdoc'] = refdoc.docid
    template_vars['reftitle'] = refdoc.title
    template_vars['oid'] = str(refdoc._id)
    template_vars['starred'] = request.session.get('starred', set())
    return render_to_response('pippi.html',
                              template_vars,
                              context_instance=RequestContext(request))
示例#3
0
文件: doc.py 项目: nifgraup/pippi
 def __init__(self,raw=None,docid=None,oid=None,d=None):
     """Load an existing document or create a new one from raw content.

     Lookup order: by mongo ``oid``, else by ``docid``; either lookup
     replaces ``d``. A truthy ``d`` is copied into the instance. Otherwise
     a truthy ``raw`` creates and saves a new document.

     Raises:
         KeyError: when no record was found/given and ``raw`` is empty.
     """
     if oid:
         # get by mongo oid
         d=Docs.find_one({"_id": oid})
     elif docid:
         # get by docid
         d=Docs.find_one({"docid": docid})

     if d:
         # load the stored values and stop here
         self.__dict__.update(d)
         return

     if not raw:
         raise KeyError('empty docid')

     # create a new document
     attrs=self.__dict__
     attrs.update({
         'docid' : docid,
         'pippies' : [],
         'pippiDocs' : [],
         'pippiDocsLen' : 0,
         'rawid' : None,
         })
     # keep a type/metadata that was set before this constructor ran
     attrs.setdefault('type','raw')
     attrs.setdefault('metadata',{})
     self.raw=raw
     self.lang=guessLanguage(" ".join(self.text))
     self.save()
示例#4
0
 def __init__(self,raw=None,docid=None,oid=None,d=None,owner=None):
     """Load an existing document or create a new, owned one from raw content.

     Lookup order: by mongo ``oid``, else by ``docid``; either lookup
     replaces ``d``. A truthy ``d`` is copied into the instance. Otherwise
     a truthy ``raw`` creates a new document (title defaults to the docid,
     owner is stored as ``unicode(owner)``) and saves it.

     Raises:
         KeyError: when no record was found/given and ``raw`` is empty.
     """
     if oid:
         # get by mongo oid
         d=Docs.find_one({"_id": oid})
     elif docid:
         # get by docid
         d=Docs.find_one({"docid": docid})

     if d:
         # load the stored values and stop here
         self.__dict__.update(d)
         return

     if not raw:
         raise KeyError('empty docid')

     # create a new document
     attrs=self.__dict__
     attrs.update({
         'docid' : docid,
         'owner': unicode(owner),
         'pippies' : [],
         'pippiDocs' : [],
         'pippiDocsLen' : 0,
         'rawid' : None,
         'title': docid,
         })
     # keep a type/metadata that was set before this constructor ran
     attrs.setdefault('type','raw')
     attrs.setdefault('metadata',{})
     self.raw=raw
     attrs['lang']=guessLanguage(" ".join(self.text))
     self.stems # for caching
     self.save()
示例#5
0
 def delete(self):
     """Remove this document and the per-field records it references.

     For every ``(key, collection)`` pair in ``self.fieldMap`` the record
     whose ``_id`` is stored in the instance under ``"<key>id"`` is removed
     from that collection. This part is best-effort: a missing id or a
     failing removal is skipped. Finally the document itself is removed
     from ``Docs``.
     """
     for key, col in self.fieldMap.items():
         try:
             col.remove({'_id': self.__dict__["%sid" % key]})
         except Exception:
             # Best-effort cleanup. Catching Exception (not a bare except)
             # lets KeyboardInterrupt/SystemExit propagate.
             pass
     # TODO also remove pippies and other stuff cached after pippring
     Docs.remove({'_id': self.__dict__['_id']})
示例#6
0
文件: views.py 项目: stef/le-n-x
def getOverview():
    """Return corpus statistics as a list of ``{'title', 'value', 'text'}`` dicts.

    One entry each for the Docs, Pippies and Frags collections, in that
    order. Every collection is counted exactly once so that ``value`` and
    ``text`` always agree and no query is issued twice.
    """
    stats=[]
    for title, text, collection in (
            ('Total documents', "%s Documents", Docs),
            ('Total Pippies', "with %s Pippies", Pippies),
            ('Locations', "in %s Locations", Frags)):
        total=collection.count()
        stats.append({'title': title,
                      'value': total,
                      'text': text % total})
    return stats
示例#7
0
 def save(self,d1,d2,pkt):
     """Persist one pippi match between documents d1 and d2.

     Adds the pippi id to both documents' ``pippies`` sets, adds both
     document ids to the pippi's ``docs`` set, and stores one Frags record
     per position listed in ``pkt['d1ps']`` (for d1) and ``pkt['d2ps']``
     (for d2).

     Returns:
         ``pkt`` unchanged, or None when ``pkt`` is falsy.
     """
     # todo new code to directly addtoset mongo-style
     if not pkt: return
     pippi=Pippi(pkt['pippi'])
     for doc in (d1, d2):
         Docs.update({'_id': doc._id},
                     { '$addToSet' : { 'pippies' : pippi._id } })
     # NOTE(review): docslen is incremented by 2 even when $addToSet adds
     # nothing new - confirm this is intended.
     Pippies.update({'_id' : pippi._id},
                    {'$addToSet': { 'docs' : { '$each' : [d1._id, d2._id]}},
                     '$inc' : { 'docslen' : 2 }})
     # d1's fragments are stored first, then d2's
     for doc, positions in ((d1, pkt['d1ps']), (d2, pkt['d2ps'])):
         for p in positions:
             Frags.save({'pos': p['pos'], 'txt': p['txt'], 'l': pkt['l'], 'doc': doc._id, 'pippi': pippi._id})
     return pkt
示例#8
0
文件: views.py 项目: stef/le-n-x
def filterDocs(request):
    """Return a JSON page of documents filtered by the GET parameters.

    Supported parameters: ``q`` (case-insensitive regex on the title),
    ``starred=true`` (only ids stored in the session's 'starred' entry)
    and ``mine=true`` (only documents owned by the requesting user).
    Results are sorted by ``_id`` descending before paging.
    """
    params=request.GET
    query={}
    pattern=params.get('q')
    if pattern:
        query['title']=re.compile(pattern, re.I)
    if params.get('starred')=='true':
        starredIds=[ObjectId(x) for x in request.session.get('starred',())]
        query['_id']={ '$in': starredIds }
    if params.get('mine')=='true':
        query['owner']=unicode(request.user)

    cursor=Docs.find(query, sort=[('_id',pymongo.DESCENDING)])
    res=pager(request,cursor,'docid',False)
    starred=request.session.get('starred',set())

    entries=[]
    for record in res['data']:
        doc=Doc(d=record)
        oid=str(doc._id)
        isStarred=oid in starred
        entries.append({'id': doc.docid,
                        'starred': u'\u2605' if isStarred else u'\u2606',
                        'starclass': 'starred' if isStarred else '',
                        'title': doc.title,
                        'meta': doc.metadata,
                        'oid': oid,
                        'indexed': doc.pippiDocs,
                        'pippies': len(doc.pippies),
                        'type': doc.type,
                        'tags': doc.autoTags(25),
                        })
    res['docs']=entries
    return HttpResponse(jdump(res),mimetype="application/json")
示例#9
0
    def __init__(self, docid=None, *args,**kwargs):
        """Create or load an etherpad-backed document.

        The instance type is set to 'etherpad'. When ``docid`` matches
        PADRE and no document with that docid is stored in Docs yet, the
        pad's HTML export is downloaded, cleaned with tidy and handed to
        the parent constructor as ``raw``. In every other case the parent
        constructor is called with the (possibly normalized) docid.
        """
        self.__dict__['type'] = 'etherpad'
        if docid:
            hostValidator = PADRE.search(docid)
            if hostValidator:
                # normalize the docid to "<group 2>/<group 3>" when both groups matched
                if hostValidator.group(2) and hostValidator.group(3):
                    docid=("%s/%s" % (hostValidator.group(2), hostValidator.group(3))).encode('utf8')
                    kwargs['docid']=docid
                # export URL; group 1 is used as prefix, falling back to 'http://'
                url="%s%s/ep/pad/export/%s/latest?format=html" % (hostValidator.group(1) or 'http://', hostValidator.group(2), hostValidator.group(3))
                # only fetch the pad when it is not stored yet
                if not Docs.find_one({"docid": docid}):
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    # title: all text nodes of <title>, unescaped, stripped, utf8-encoded
                    self.__dict__['title']=unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')

                    # wrap the pad body in a minimal HTML document with an explicit utf-8 charset
                    doc='<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>%s</html>' % (self.title, unescape(unicode(soup.body)).encode('utf8'))
                    # run the result through tidy with the options below
                    raw=str(tidy.parseString(doc, **{'output_xhtml' : 1,
                                                             'add_xml_decl' : 0,
                                                             'indent' : 0,
                                                             'tidy_mark' : 0,
                                                             'doctype' : "strict",
                                                             'wrap' : 0}))
                    kwargs['raw'] = raw
                    kwargs['docid']=docid
                    super(Etherpad,self).__init__(*args, **kwargs)
                    if not 'stems' in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    # the freshly imported pad is fully initialized at this point
                    return
            # docid did not match PADRE, or the pad is already stored
            kwargs['docid']=docid
        super(Etherpad,self).__init__(*args, **kwargs)
示例#10
0
def filterDocs(request):
    """Answer a document-list query with a JSON page of matching documents.

    GET parameters: ``q`` narrows by a case-insensitive regex on the title,
    ``starred=true`` keeps only ids held in the session's 'starred' entry,
    ``mine=true`` keeps only documents owned by the requesting user. The
    cursor is sorted by ``_id`` descending before it is paged.
    """
    params = request.GET
    query = {}
    pattern = params.get('q')
    if pattern:
        query['title'] = re.compile(pattern, re.I)
    if params.get('starred') == 'true':
        starred_ids = [ObjectId(x) for x in request.session.get('starred', ())]
        query['_id'] = {'$in': starred_ids}
    if params.get('mine') == 'true':
        query['owner'] = unicode(request.user)

    cursor = Docs.find(query, sort=[('_id', pymongo.DESCENDING)])
    res = pager(request, cursor, 'docid', False)
    starred = request.session.get('starred', set())

    entries = []
    for record in res['data']:
        doc = Doc(d=record)
        oid = str(doc._id)
        is_starred = oid in starred
        entries.append({
            'id': doc.docid,
            'starred': u'\u2605' if is_starred else u'\u2606',
            'starclass': 'starred' if is_starred else '',
            'title': doc.title,
            'meta': doc.metadata,
            'oid': oid,
            'indexed': doc.pippiDocs,
            'pippies': len(doc.pippies),
            'type': doc.type,
            'tags': doc.autoTags(25),
        })
    res['docs'] = entries
    return HttpResponse(jdump(res), mimetype="application/json")
示例#11
0
文件: cmt.py 项目: asciimoo/le-n-x
    def __init__(self, docid=None, *args,**kwargs):
        """Create or load a co-ment-backed document.

        The instance type is set to 'co-ment'. When ``docid`` matches CMTRE
        and no document with that docid is stored in Docs yet, the text's
        view page (for the title) and comments page (for the content) are
        downloaded and handed to the parent constructor as ``raw``. In
        every other case the parent constructor is called with the
        (possibly normalized) docid.
        """
        self.__dict__['type'] = 'co-ment'
        if docid:
            hostValidator = CMTRE.search(docid)
            if hostValidator:
                # normalize the docid to "<group 2><group 4>" when any of groups 1, 3 or 5 matched
                if hostValidator.group(1) or hostValidator.group(3) or hostValidator.group(5):
                    docid=("%s%s" % (hostValidator.group(2), hostValidator.group(4))).encode('utf8')
                    kwargs['docid']=docid
                url="https://%s/text/%s/view/" % (hostValidator.group(2), hostValidator.group(4))
                # only fetch the text when it is not stored yet
                if not Docs.find_one({"docid": docid}):
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    # title: all text nodes of <title>, unescaped, stripped, utf8-encoded
                    self.__dict__['title']=unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')

                    # NOTE(review): unlike `url` above there is no '/' between 'text' and
                    # group 4 here - confirm which form CMTRE's group 4 requires.
                    dataurl = "https://%s/text%s/comments/" % (hostValidator.group(2), hostValidator.group(4))
                    data = urllib2.urlopen(dataurl).read()
                    soup = BeautifulSoup(data)

                    # the element with id 'textcontainer' becomes the body of the stored document
                    kwargs['raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (self.title, unescape(unicode(soup.find(attrs={'id' : 'textcontainer'}))).encode('utf8'))
                    kwargs['docid']=docid
                    super(Coment,self).__init__(*args, **kwargs)
                    if not 'stems' in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    # the freshly imported text is fully initialized at this point
                    return
            # docid did not match CMTRE, or the text is already stored
            kwargs['docid']=docid
        super(Coment,self).__init__(*args, **kwargs)
示例#12
0
def docView(request, doc=None, cutoff=10):
    """Render 'docView.html' for one document together with its related documents.

    Args:
        request: the incoming HTTP request; a ``cutoff`` GET parameter
            overrides the ``cutoff`` argument.
        doc: docid of the document to show.
        cutoff: passed on to ``getRelatedDocIds``, ``getFrags`` and
            ``cutoffSL``.

    Returns:
        The rendered page, or 'error.html' when ``doc`` is missing or the
        cutoff is zero or not a number.

    Raises:
        Http404: when the document cannot be instantiated.
    """
    raw_cutoff = request.GET.get('cutoff')
    if raw_cutoff:
        try:
            cutoff = int(raw_cutoff)
        except ValueError:
            # A non-numeric cutoff used to escape as an unhandled ValueError;
            # route it to the "wrong cutoff" error page below instead.
            cutoff = 0
    if not doc or not cutoff:
        return render_to_response(
            'error.html', {'error': 'Missing document or wrong cutoff!'},
            context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Exception rather than a bare except, so that
        # KeyboardInterrupt/SystemExit are not turned into a 404.
        raise Http404
    cont = d.body
    relDocs = Docs.find(
        {'_id': {
            '$in': list(d.getRelatedDocIds(cutoff=cutoff))
        }}, ['docid', 'title'])
    return render_to_response('docView.html', {
        'doc': d,
        'oid': d._id,
        'user': request.user,
        'content': cont,
        'related': relDocs,
        'cutoff': cutoff,
        'cutoffs': ','.join(cutoffSL(d, cutoff)),
        'len': d.getFrags(cutoff=cutoff).count()
    },
                              context_instance=RequestContext(request))
示例#13
0
文件: views.py 项目: stef/le-n-x
def starred(request):
    """List the documents whose ids are stored in the session's 'starred' entry.

    The matching documents are sorted by docid ascending, paged, and handed
    to ``_listDocs`` under the title 'Your starred documents'.
    """
    starredIds=[ObjectId(x) for x in request.session.get('starred',())]
    cursor=Docs.find({'_id' : { '$in': starredIds }},
                     sort=[('docid',pymongo.ASCENDING)])
    template_vars=pager(request,cursor,'docid',False)
    template_vars['title']='Your starred documents'
    return _listDocs(request, template_vars)
示例#14
0
def getOverview():
    """Return corpus statistics as a list of ``{'title', 'value', 'text'}`` dicts.

    One entry each for the Docs, Pippies and Frags collections, in that
    order. Every collection is counted exactly once so that ``value`` and
    ``text`` always agree and no query is issued twice.
    """
    stats = []
    for title, text, collection in (
            ('Total documents', "%s Documents", Docs),
            ('Total Pippies', "with %s Pippies", Pippies),
            ('Locations', "in %s Locations", Frags)):
        total = collection.count()
        stats.append({
            'title': title,
            'value': total,
            'text': text % total
        })
    return stats
示例#15
0
 def __init__(self, docid=None, *args,**kwargs):
     """Create or load a EUR-Lex document.

     ``docid`` is parsed into sector, year, doctype, refno and lang. When
     no document with the resulting docid is stored in Docs, the HTML is
     fetched through CACHE, checked for the known error pages and passed
     to the parent constructor as ``raw``; afterwards title and metadata
     are stored.

     Raises:
         ValueError: "Request Error" / "Parameter Error" when the fetched
             page is still an error page after the retries are used up,
             "Language Error" when the page states that no English version
             is available.
     """
     setTitle=False
     if docid:
         alias=re.match(SHORTCUTRE,docid)
         if alias:
             # shortcut form: doctype is looked up from group 1, year is group 2,
             # refno is group 3 zero-padded to four digits
             self.__dict__['sector'] = '3'
             self.__dict__['year'] = alias.group(2)
             self.__dict__['doctype'] = SHORTCUTMAP[alias.group(1)]
             self.__dict__['refno'] = "%04d" % int(alias.group(3))
             self.__dict__['lang'] = 'EN' # assuming default
         else:
             # colon-separated form: the second and third fields are code and language
             (code,lang)=docid.split(":")[1:3]
             # the doctype is two characters long when the character at index 6 is a letter, else one
             st=7 if code[6].isalpha() else 6
             self.__dict__['sector'] = code[0]
             self.__dict__['year'] = code[1:5]
             self.__dict__['doctype'] = code[5:st]
             self.__dict__['refno'] = code[st:]
             self.__dict__['lang'] = lang
         self.__dict__['type'] = 'eurlex'
         # NOTE(review): self.docid is not assigned in this method; presumably it is
         # derived from the fields set above - confirm against the class.
         kwargs['docid']=self.docid
         if not Docs.find_one({"docid": self.docid}):
             # up to 4 retries after the first attempt when an error page comes back
             retries=4
             while True:
                 raw=CACHE.fetchUrl(EURLEXURL+self.docid+":HTML")
                 soup=BeautifulSoup(raw)
                 # TODO handle empty or invalid celex ids - also handle other languages besides english!!!
                 # <TITLE>Request Error</TITLE>
                 # <h1>The parameters of the link are incorrect.</h1>
                 if soup.title and soup.title.string == "Request Error":
                     if retries>0:
                         retries=retries-1
                         continue
                     else:
                         raise ValueError, "Request Error"
                 if soup.h1 and soup.h1.string == 'The parameters of the link are incorrect.':
                     if retries>0:
                         retries=retries-1
                         continue
                     else:
                         raise ValueError, "Parameter Error"
                 # no errors found, continue, nothing to see here
                 break
             # > /* There is no English version of this document available since it was not included in the English Special Edition.
             content=soup.find(id='TexteOnly')
             # reject pages whose second paragraph carries the "no English version" notice
             if (content and
                 content.findAll('p') and
                 len(content.findAll('p'))>1 and
                 'string' in dir(content.findAll('p')[1]) and
                 content.findAll('p')[1].string.strip().startswith('/* There is no English version of this document available since it was not included in the English Special Edition.')):
                 raise ValueError, "Language Error"
             kwargs['raw']=anchorArticles(raw)
             self.__dict__['metadata'] = self.extractMetadata()
             # the title is set only after the parent constructor has run
             setTitle=True
     super(Eurlex,self).__init__(*args, **kwargs)
     if setTitle:
         self.__dict__['title']=self._gettitle()
         self.save()
示例#16
0
文件: views.py 项目: asciimoo/le-n-x
def starred(request):
    """Render 'corpus.html' with the documents starred in the current session.

    The ids come from the session's 'starred' entry; the matching
    documents are paged by docid and listed with their statistics.
    """
    starredIds=[ObjectId(x) for x in request.session.get('starred',())]
    template_vars=pager(request,Docs.find({'_id' : { '$in': starredIds }},['_id','docid']),'docid',False)
    # Only the docids are needed to instantiate the documents. The former
    # `docslen=Docs.count()` was never read and has been dropped.
    docids=[doc['docid'] for doc in template_vars['data']]
    template_vars['docs']=[{'id': doc.docid,
                            'oid': str(doc._id),
                            'indexed': doc.pippiDocsLen,
                            'title': doc.title,
                            'frags': doc.getFrags().count(),
                            'pippies': len(doc.pippies),
                            'type': doc.type,
                            'docs': len(doc.getRelatedDocIds()),
                            'tags': doc.autoTags(25) }
                           for doc in (Doc(docid=d) for d in docids)]
    template_vars['stats']=getOverview()
    template_vars['starred']=request.session.get('starred',set())
    template_vars['title']='Your starred documents'
    return render_to_response('corpus.html', template_vars, context_instance=RequestContext(request))
示例#17
0
def starred(request):
    """Show the documents whose ids are held in the session's 'starred' entry.

    The matching documents are sorted by docid ascending, paged, and passed
    to ``_listDocs`` under the title 'Your starred documents'.
    """
    starred_ids = [ObjectId(x) for x in request.session.get('starred', ())]
    cursor = Docs.find({'_id': {'$in': starred_ids}},
                       sort=[('docid', pymongo.ASCENDING)])
    template_vars = pager(request, cursor, 'docid', False)
    template_vars['title'] = 'Your starred documents'
    return _listDocs(request, template_vars)
示例#18
0
文件: views.py 项目: asciimoo/le-n-x
def metaView(request,doc=None):
    """Render 'meta.html' with a document's metadata and its related documents.

    Args:
        request: the incoming HTTP request.
        doc: docid of the document; when missing, 'error.html' is rendered.

    Returns:
        The rendered metadata page. When the document cannot be
        instantiated, the upload form pre-filled with the docid is
        rendered instead.
    """
    if not doc:
        return render_to_response('error.html', {'error': 'Missing document!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        # Exception rather than a bare except, so that
        # KeyboardInterrupt/SystemExit are not swallowed by the fallback.
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', { 'form': form, }, context_instance=RequestContext(request))

    relDocs = Docs.find({'_id': { '$in': list(d.getRelatedDocIds(cutoff=5))} }, ['docid','title'])
    return render_to_response('meta.html', {'doc': d,
                                            'related': relDocs,
                                            'metadata': d.metadata,
                                            }, context_instance=RequestContext(request))
示例#19
0
文件: views.py 项目: stef/le-n-x
def metaView(request,doc=None):
    """Render 'meta.html' with a document's metadata and its related documents.

    Args:
        request: the incoming HTTP request; ``request.user`` is passed to
            ``Doc`` as owner.
        doc: docid of the document; when missing, 'error.html' is rendered.

    Raises:
        Http404: when the document cannot be instantiated.
    """
    if not doc:
        return render_to_response('error.html', {'error': 'Missing document!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Exception rather than a bare except, so that
        # KeyboardInterrupt/SystemExit are not turned into a 404.
        raise Http404

    relDocs = Docs.find({'_id': { '$in': list(d.getRelatedDocIds(cutoff=5))} }, ['docid','title'])
    return render_to_response('meta.html', {'doc': d,
                                            'oid': d._id,
                                            'related': relDocs,
                                            'metadata': d.metadata,
                                            }, context_instance=RequestContext(request))
示例#20
0
def main():
    print "updateing pippies.relevance"
    pippies=Pippies.find({},['docs','len'])
    pippieslen=pippies.count()
    i=1
    for pippi in pippies:
        if (i*100/pippieslen)!=((i-1)*100/pippieslen):
            if (i*100/pippieslen) % 10 == 0:
                sys.stdout.write("%d" % (i*100/pippieslen))
                sys.stdout.flush()
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if len(pippi['docs'])>0:
                Pippies.update({'_id' : pippi['_id']},
                               { '$set': { 'relevance': float(pippi['len'])/float(len(pippi['docs'])),
                                           'docslen': len(pippi['docs']),}, })
        i=i+1
    sys.stdout.write('\n')
    sys.stdout.flush()

    print "updateing docs.idf"
    docs=Docs.find({},['termcnt','docid','stemsid','rawid'])
    docslen=docs.count()
    i=1
    for dd in docs:
        if (i*100/docslen)!=((i-1)*100/docslen):
            if (i*100/docslen) % 10 == 0:
                sys.stdout.write("%d" % (i*100/docslen))
                sys.stdout.flush()
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        Docs.update({'_id': dd['_id']},{ '$set': { 'tfidf': Doc(d=dd).tfidf } })
        i=i+1
    sys.stdout.write('\n')
    sys.stdout.flush()
示例#21
0
文件: views.py 项目: asciimoo/le-n-x
def frags(request):
    """Render 'frags.html': a paged list of fragments, longest first.

    GET parameters:
        doc: ObjectId of a document to restrict the fragments to.
        pippi: ObjectId of a pippi to restrict the fragments to; when
            given, each fragment text is passed through ``diffFrag``
            together with the previous one.
        cutoff: minimum fragment length (default 7); only read when no
            pippi filter is active.

    Missing or malformed parameters are ignored.
    """
    filtr={}
    docfilter=None
    cutoff=None
    pippifilter=None
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc','')))
    except Exception:
        # absent or malformed id: no document filter. Exception rather than
        # a bare except keeps KeyboardInterrupt/SystemExit alive.
        pass
    if docfilter:
        filtr['doc']=docfilter
    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi','')))
    except Exception:
        # absent or malformed id: no pippi filter
        pass
    if pippifilter:
        filtr['pippi']=pippifilter
    else:
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff','7')))
        except ValueError:
            # non-numeric cutoff: no length filter
            pass
    if cutoff: filtr['l']={ '$gte': cutoff }
    orderBy = 'l'
    orderDesc = True
    template_vars=pager(request,Frags.find(filtr),orderBy,orderDesc)
    prevDoc=None
    template_vars['frags']=[]
    for frag in template_vars['data']:
        p=Pippi('',oid=frag['pippi'])
        d=Doc(oid=frag['doc'])
        if pippifilter:
            frag['txt']=diffFrag(prevDoc,frag['txt'])
            # NOTE(review): prevDoc becomes the diffed text, not the original
            # fragment text - confirm that is what diffFrag expects.
            prevDoc=frag['txt']
        template_vars['frags'].append({'_id': frag['_id'],
                                       'pos': frag['pos'],
                                       'txt': " ".join(frag['txt']),
                                       'len': frag['l'],
                                       'score': sum([d.tfidf.get(t,0) for t in p.pippi]),
                                       'pippi': p,
                                       'doc': d,
                                       })

    template_vars['pippi']=pippifilter
    template_vars['doc']=docfilter
    if docfilter:
        # find_one returns None for an unknown id; skip the title then
        # instead of failing on the subscript
        titleDoc=Docs.find_one({'_id': docfilter},['docid'])
        if titleDoc is not None:
            template_vars['docTitle']=titleDoc['docid']
    if pippifilter: template_vars['pippiFilter']=1 #" ".join(Pippies.find_one({'_id': pippifilter},['pippi'])['pippi'])
    return render_to_response('frags.html', template_vars, context_instance=RequestContext(request))
示例#22
0
文件: views.py 项目: asciimoo/le-n-x
def pippi(request,refdoc=None):
    """Render 'pippi.html': the paged document list seen from a reference document.

    Args:
        request: the incoming HTTP request.
        refdoc: docid of the reference document; when missing, 'error.html'
            is rendered instead.

    Returns:
        The rendered response. Each listed document carries a ``job`` flag
        that is true when its ``_id`` is not in the reference document's
        ``pippiDocs``; the reference document itself is left out of the list.
    """
    if not refdoc:
        return render_to_response('error.html', {'error': 'specify document: %s!' % refdoc}, context_instance=RequestContext(request))
    refdoc=Doc(docid=refdoc)
    template_vars=pager(request,Docs.find({},['_id','docid']),'docid',False)
    # (docid, oid) pairs of the current page, ordered by docid
    docs=sorted([(doc['docid'],doc['_id']) for doc in template_vars['data']])
    # The former `docslen=Docs.count()` was never read and only cost a
    # database round-trip, so it has been dropped.
    template_vars['docs']=[{'id': doc.docid,
                            'oid': str(doc._id),
                            'indexed': doc.pippiDocsLen,
                            'title': doc.title,
                            'frags': doc.getFrags().count(),
                            'pippies': len(doc.pippies),
                            'job': doc._id not in refdoc.pippiDocs,
                            'type': doc.type,
                            'docs': len(doc.getRelatedDocIds()),
                            'tags': doc.autoTags(25) }
                           for doc in (Doc(docid=d) for d,oid in docs if oid != refdoc._id)]
    template_vars['stats']=getOverview()
    template_vars['refdoc']=refdoc.docid
    template_vars['reftitle']=refdoc.title
    template_vars['oid']=str(refdoc._id)
    template_vars['starred']=request.session.get('starred',set())
    return render_to_response('pippi.html', template_vars, context_instance=RequestContext(request))
示例#23
0
文件: doc.py 项目: nifgraup/pippi
def Doc(*args, **kwargs):
    """Factory: build the document class that fits the given keyword arguments.

    DOCTYPES entries are ``(type name, class, docid pattern)`` triples.
    The first match wins, tried in this order: ``docid`` against the
    patterns, the record ``d`` by its type or docid, the stored type of
    ``oid``. Without a match the generic ``DOC`` class is used.
    """
    if 'docid' in kwargs:
        docid=kwargs['docid']
        for (_,cls,pattern) in DOCTYPES:
            if re.match(pattern,docid):
                return cls(*args,**kwargs)
    if 'd' in kwargs:
        record=kwargs['d']
        for (typename,cls,pattern) in DOCTYPES:
            # the record's own type is checked before its docid pattern
            if record.get('type','') == typename:
                return cls(*args,**kwargs)
            if re.match(pattern,record.get('docid','')):
                return cls(*args,**kwargs)
    if 'oid' in kwargs:
        stored=Docs.find_one({"_id": kwargs['oid']},['type'])
        storedType=stored['type']
        for (typename,cls,_) in DOCTYPES:
            if storedType == typename:
                return cls(*args,**kwargs)
    return DOC(*args,**kwargs)
示例#24
0
def Doc(*args, **kwargs):
    """Factory: pick and instantiate the document class for the given arguments.

    DOCTYPES entries are ``(type name, class, docid pattern)`` triples.
    Resolution order, first match wins: the ``docid`` keyword against the
    patterns, the record ``d`` by type or docid, the type stored for
    ``oid``. Falls back to the generic ``DOC`` class.
    """
    if 'docid' in kwargs:
        wanted=kwargs['docid']
        for (_,klass,regex) in DOCTYPES:
            if re.match(regex,wanted):
                return klass(*args,**kwargs)
    if 'd' in kwargs:
        rec=kwargs['d']
        for (name,klass,regex) in DOCTYPES:
            # the record's own type is checked before its docid pattern
            if rec.get('type','') == name:
                return klass(*args,**kwargs)
            if re.match(regex,rec.get('docid','')):
                return klass(*args,**kwargs)
    if 'oid' in kwargs:
        found=Docs.find_one({"_id": kwargs['oid']},['type'])
        foundType=found['type']
        for (name,klass,_) in DOCTYPES:
            if foundType == name:
                return klass(*args,**kwargs)
    return DOC(*args,**kwargs)
示例#25
0
def pippies(request):
    """Render 'pippies.html': a paged, sortable list of pippies.

    GET parameters:
        cutoff: minimum pippi length (default 7).
        doc: ObjectId of a document the pippi must occur in.
        relevance: exact relevance value to match.
        orderby: sort field (default 'relevance').
        desc: '1' (default) for descending order.

    Missing or malformed filter parameters are ignored.
    """
    filtr = {}
    docfilter = None
    relfilter = None
    cutoff = None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except ValueError:
        # non-numeric cutoff: no length filter
        pass
    if cutoff: filtr['len'] = {'$gte': cutoff}
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # absent or malformed id: no document filter. Exception rather than
        # a bare except keeps KeyboardInterrupt/SystemExit alive.
        pass
    if docfilter:
        filtr['docs'] = docfilter
    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except ValueError:
        # absent or non-numeric relevance: no relevance filter
        pass
    if relfilter: filtr['relevance'] = relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = '1' == cgi.escape(request.GET.get('desc', '1'))
    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [{
        'id': pippi['_id'],
        # empty stems are displayed as '*'
        'pippi': ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
        'docslen': len(pippi['docs']),
        'relevance': pippi.get('relevance', 0),
    } for pippi in template_vars['data']]
    template_vars['doc'] = docfilter
    if docfilter:
        # find_one returns None for an unknown id; skip the title then
        # instead of failing on the membership test
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        if doc is not None:
            template_vars['docTitle'] = (doc['title'] if 'title' in doc
                                         else doc['docid'])
    return render_to_response('pippies.html',
                              template_vars,
                              context_instance=RequestContext(request))
示例#26
0
def search(request):
    """Render 'search.html' with the documents whose stems contain every query stem.

    The ``q`` GET parameter is tokenized and stemmed in its guessed
    language; stop words and stems of one character are replaced by ''.
    When ``q`` is missing, 'error.html' is rendered.
    """
    q = cgi.escape(request.GET.get('q', ''))
    if not q:
        return render_to_response('error.html',
                                  {'error': 'Missing search query!'},
                                  context_instance=RequestContext(request))

    lang = guessLanguage(q)
    # fall back to the English stop words for languages without a list
    swords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    engine = getStemmer(lang)
    filtr = []
    for word in nltk.tokenize.wordpunct_tokenize(unicode(q)):
        # stem each word
        stem = engine.stem(word.encode('utf8'))
        usable = stem and stem[0] not in swords and len(stem[0]) > 1
        filtr.append(stem[0] if usable else '')

    hits = DocStems.find({'value': {'$all': filtr}}, ['_id'])
    matches = [hit['_id'] for hit in hits]
    cursor = Docs.find({"stemsid": {'$in': matches}})
    template_vars = pager(request, cursor, 'docid', False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())

    rows = []
    for record in template_vars['data']:
        doc = Doc(d=record)
        rows.append({
            'id': doc.docid,
            'oid': str(doc._id),
            'indexed': doc.pippiDocsLen,
            'title': doc.title,
            'frags': doc.getFrags().count(),
            'pippies': len(doc.pippies),
            'type': doc.type,
            'docs': len(doc.getRelatedDocIds()),
            'tags': doc.autoTags(25)
        })
    template_vars['docs'] = rows
    return render_to_response('search.html',
                              template_vars,
                              context_instance=RequestContext(request))
示例#27
0
def metaView(request, doc=None):
    """Render 'meta.html' with a document's metadata and its related documents.

    Args:
        request: the incoming HTTP request; ``request.user`` is passed to
            ``Doc`` as owner.
        doc: docid of the document; when missing, 'error.html' is rendered.

    Raises:
        Http404: when the document cannot be instantiated.
    """
    if not doc:
        return render_to_response('error.html', {'error': 'Missing document!'},
                                  context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Exception rather than a bare except, so that
        # KeyboardInterrupt/SystemExit are not turned into a 404.
        raise Http404

    relDocs = Docs.find({'_id': {
        '$in': list(d.getRelatedDocIds(cutoff=5))
    }}, ['docid', 'title'])
    return render_to_response('meta.html', {
        'doc': d,
        'oid': d._id,
        'related': relDocs,
        'metadata': d.metadata,
    },
                              context_instance=RequestContext(request))
示例#28
0
文件: views.py 项目: nifgraup/pippi
def docView(request,doc=None,cutoff=10):
    """Render 'docView.html' for one document together with its related documents.

    Args:
        request: the incoming HTTP request; a ``cutoff`` GET parameter
            overrides the ``cutoff`` argument.
        doc: docid of the document to show.
        cutoff: passed on to ``getRelatedDocIds``, ``getFrags`` and
            ``cutoffSL``.

    Returns:
        The rendered page; 'error.html' when ``doc`` is missing or the
        cutoff is zero or not a number; the upload form pre-filled with
        the docid when the document cannot be instantiated.
    """
    rawCutoff = request.GET.get('cutoff')
    if rawCutoff:
        try:
            cutoff = int(rawCutoff)
        except ValueError:
            # A non-numeric cutoff used to escape as an unhandled ValueError;
            # route it to the "wrong cutoff" error page below instead.
            cutoff = 0
    if not doc or not cutoff:
        return render_to_response('error.html', {'error': 'Missing document or wrong cutoff!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        # Exception rather than a bare except, so that
        # KeyboardInterrupt/SystemExit are not swallowed by the fallback.
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', { 'form': form, }, context_instance=RequestContext(request))
    cont = d.body
    relDocs = Docs.find({'_id': { '$in': list(d.getRelatedDocIds(cutoff=cutoff))} }, ['docid','title'])
    return render_to_response('docView.html', {'doc': d,
                                               'oid': d._id,
                                               'user': request.user,
                                               'content': cont,
                                               'related': relDocs,
                                               'cutoff': cutoff,
                                               'cutoffs': ','.join(cutoffSL(d,cutoff)),
                                               'len': d.getFrags(cutoff=cutoff).count()}, context_instance=RequestContext(request))
示例#29
0
    def __init__(self, docid=None, *args, **kwargs):
        """Create or load a co-ment-backed document.

        The instance type is set to 'co-ment'. When ``docid`` matches CMTRE
        and no document with that docid is stored in Docs yet, the text's
        view page (for the title) and comments page (for the content) are
        downloaded and handed to the parent constructor as ``raw``. In
        every other case the parent constructor is called with the
        (possibly normalized) docid.
        """
        self.__dict__['type'] = 'co-ment'
        if docid:
            hostValidator = CMTRE.search(docid)
            if hostValidator:
                # normalize the docid to "<group 2><group 4>" when any of groups 1, 3 or 5 matched
                if hostValidator.group(1) or hostValidator.group(
                        3) or hostValidator.group(5):
                    docid = ("%s%s" % (hostValidator.group(2),
                                       hostValidator.group(4))).encode('utf8')
                    kwargs['docid'] = docid
                url = "https://%s/text/%s/view/" % (hostValidator.group(2),
                                                    hostValidator.group(4))
                # only fetch the text when it is not stored yet
                if not Docs.find_one({"docid": docid}):
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    # title: all text nodes of <title>, unescaped, stripped, utf8-encoded
                    self.__dict__['title'] = unescape(
                        unicode(''.join(soup.title.findAll(
                            text=True)))).strip().encode('utf8')

                    # NOTE(review): unlike `url` above there is no '/' between 'text' and
                    # group 4 here - confirm which form CMTRE's group 4 requires.
                    dataurl = "https://%s/text%s/comments/" % (
                        hostValidator.group(2), hostValidator.group(4))
                    data = urllib2.urlopen(dataurl).read()
                    soup = BeautifulSoup(data)

                    # the element with id 'textcontainer' becomes the body of the stored document
                    kwargs[
                        'raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (
                            self.title,
                            unescape(
                                unicode(
                                    soup.find(attrs={'id': 'textcontainer'
                                                     }))).encode('utf8'))
                    kwargs['docid'] = docid
                    super(Coment, self).__init__(*args, **kwargs)
                    if not 'stems' in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    # the freshly imported text is fully initialized at this point
                    return
            # docid did not match CMTRE, or the text is already stored
            kwargs['docid'] = docid
        super(Coment, self).__init__(*args, **kwargs)
示例#30
0
文件: views.py 项目: stef/le-n-x
def search(request):
    """Full-text search view: stem the query and list the matching documents.

    The ``q`` GET parameter is escaped, tokenized and stemmed with the
    stemmer for its guessed language.  Documents are selected through
    DocStems entries whose ``value`` contains all resulting stems, then
    paged by ``docid`` and rendered with ``search.html``.  An empty query
    renders ``error.html``.
    """
    q = cgi.escape(request.GET.get('q', ''))
    if not q:
        return render_to_response('error.html', {'error': 'Missing search query!'}, context_instance=RequestContext(request))

    lang = guessLanguage(q)
    # fall back to the English stop word list for unknown languages
    swords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    engine = getStemmer(lang)

    def stem_or_blank(word):
        # stop words, one-character stems and unstemmable words become ''
        stem = engine.stem(word.encode('utf8'))
        if stem and stem[0] not in swords and len(stem[0]) > 1:
            return stem[0]
        return ''

    filtr = [stem_or_blank(word)
             for word in nltk.tokenize.wordpunct_tokenize(unicode(q))]

    stem_hits = DocStems.find({'value': {'$all': filtr}}, ['_id'])
    matches = [hit['_id'] for hit in stem_hits]
    template_vars = pager(request,
                          Docs.find({"stemsid": {'$in': matches}}),
                          'docid',
                          False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())

    # one summary row per document on the current page
    rows = []
    for record in template_vars['data']:
        doc = Doc(d=record)
        rows.append({'id': doc.docid,
                     'oid': str(doc._id),
                     'indexed': doc.pippiDocsLen,
                     'title': doc.title,
                     'frags': doc.getFrags().count(),
                     'pippies': len(doc.pippies),
                     'type': doc.type,
                     'docs': len(doc.getRelatedDocIds()),
                     'tags': doc.autoTags(25)})
    template_vars['docs'] = rows
    return render_to_response('search.html', template_vars, context_instance=RequestContext(request))
示例#31
0
文件: views.py 项目: asciimoo/le-n-x
def pippies(request):
    """Render a paged, filterable list of pippies.

    Query parameters read from ``request.GET``:
      cutoff    -- adds ``len >= cutoff`` to the query; defaults to 7.  A
                   non-numeric or zero value disables the length filter.
      doc       -- ObjectId matched against the pippi's ``docs`` field.
      relevance -- integer matched against the ``relevance`` field.
      orderby   -- sort field handed to ``pager`` (default ``relevance``).
      desc      -- ``1`` (default) sets pager's descending flag.

    Unparsable filter values are ignored rather than reported.
    """
    filtr = {}

    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except ValueError:
        cutoff = None
    if cutoff:
        filtr['len'] = {'$gte': cutoff}

    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # missing or malformed id: treat as "no document filter"
        docfilter = None
    if docfilter:
        filtr['docs'] = docfilter

    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except ValueError:
        relfilter = None
    if relfilter:
        filtr['relevance'] = relfilter

    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = '1' == cgi.escape(request.GET.get('desc', '1'))
    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [
        {'id': row['_id'],
         # empty tokens are shown as '*' placeholders
         'pippi': ' '.join(tok or '*' for tok in row['pippi'].split(' ')),
         'docslen': len(row['docs']),
         'relevance': row.get('relevance', 0),
         }
        for row in template_vars['data']]
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        # find_one returns None for an unknown id; leave docTitle unset then
        # instead of failing on the membership test below
        if doc is not None:
            template_vars['docTitle'] = doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html', template_vars, context_instance=RequestContext(request))
示例#32
0
文件: views.py 项目: asciimoo/le-n-x
def docView(request,doc=None,cutoff=10):
    """Render a document with its fragments wrapped in highlight spans.

    doc is the docid to display.  cutoff is the minimum fragment length and
    may be overridden by a ``cutoff`` GET parameter.  A missing doc, a zero
    cutoff or a non-numeric ``cutoff`` parameter renders ``error.html``; if
    ``Doc(docid=doc)`` raises, the upload form pre-filled with the docid is
    rendered instead.
    """
    raw_cutoff = request.GET.get('cutoff', 0)
    if raw_cutoff:
        try:
            cutoff = int(raw_cutoff)
        except ValueError:
            # a non-numeric value used to escape as an unhandled ValueError;
            # route it to the "wrong cutoff" error page below instead
            cutoff = 0
    if not doc or not cutoff:
        return render_to_response('error.html', {'error': 'Missing document or wrong cutoff!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', { 'form': form, }, context_instance=RequestContext(request))
    tooltips={}
    cont = d.body
    relDocs = Docs.find({'_id': { '$in': list(d.getRelatedDocIds(cutoff=cutoff))} }, ['docid','title'])
    # runs of markup inside a match; compiled once instead of once per match
    tagrun = re.compile(r'((?:\s*<[^>]+>)+)', re.M | re.U)
    ls = []
    matches = 0
    for l in d.getFrags(cutoff=cutoff):
        if( l['l'] < cutoff): break
        # for unique locset - optimalization?!
        if l['txt'] in ls:
            continue
        ls.append(l['txt'])
        t = l['txt']
        # for valid matches: when the fragment starts/ends with an
        # alphanumeric character, require a non-word character next to it
        btxt = ''
        etxt = ''
        if t[0][0].isalnum():
            btxt = r'\W'
        if t[-1][-1].isalnum():
            etxt = r'\W'
        # tokens may be separated by whitespace and any number of tags
        rtxt = btxt+r'\s*(?:<[^>]*>\s*)*'.join([re.escape(x) for x in t])+etxt
        regex=re.compile(rtxt, re.I | re.M | re.U)
        # cont grows while we iterate over matches found in the old string;
        # offset maps old match positions into the current cont
        offset = 0
        if not l['pippi'] in tooltips:
            tooltips[l['pippi']]=annotatePippi(d,l,cutoff)
        opener = '<span class="highlight %s">' % l['pippi']
        closer = '</span>'
        for r in regex.finditer(cont):
            start = r.start()+offset
            if btxt:
                # do not highlight the leading \W guard character
                start += 1
            end = r.end()+offset
            if etxt:
                # do not highlight the trailing \W guard character
                end -= 1
            # close the span before each tag run and reopen it afterwards
            match, n = tagrun.subn(r'%s\1%s' % (closer, opener), cont[start:end])
            cont = cont[:start]+opener+match+closer+cont[end:]
            # one opener/closer pair around the match plus one per tag run
            offset += (n+1)*(len(opener)+len(closer))
            matches += 1
    cont=anchorArticles(cont)
    return render_to_response('docView.html', {'doc': d,
                                               'oid': d._id,
                                               'user': request.user,
                                               'content': cont,
                                               'related': relDocs,
                                               'cutoff': cutoff,
                                               'cutoffs': ','.join(cutoffSL(d,cutoff)),
                                               'len': len(ls),
                                               'tooltips': '\n'.join(tooltips.values()),
                                               'matches': matches}, context_instance=RequestContext(request))
示例#33
0
文件: views.py 项目: stef/le-n-x
def listDocs(request):
    """Page through all documents and render them via ``_listDocs``."""
    by_docid_desc = [('docid', pymongo.DESCENDING)]
    cursor = Docs.find(sort=by_docid_desc)
    context = pager(request, cursor, 'docid', False)
    context['title'] = 'Complete Corpus of pippi longstrings'
    return _listDocs(request, context)
示例#34
0
文件: doc.py 项目: nifgraup/pippi
 def save(self):
     """Store this object's ``__dict__`` in Docs and record the returned _id.

     Attributes named in ``self.fieldMap`` are taken out of ``__dict__``
     for the duration of the ``Docs.save`` call and put back afterwards,
     also when the call raises.
     """
     state = self.__dict__
     held_back = [(name, state[name]) for name in self.fieldMap if name in state]
     for name, _ in held_back:
         del state[name]
     try:
         state['_id'] = Docs.save(state)
     finally:
         # restore even on failure so the object is never left stripped
         for name, value in held_back:
             state[name] = value
示例#35
0
def frags(request):
    """Render a paged list of fragments.

    Query parameters read from ``request.GET``:
      doc    -- ObjectId; restricts the query to fragments whose ``doc``
                field equals it.
      pippi  -- ObjectId; restricts the query to fragments whose ``pippi``
                field equals it.  When given, each fragment text is passed
                through ``diffFrag`` together with the previous result.
      cutoff -- adds ``l >= cutoff`` to the query (default 7); only read
                when no valid ``pippi`` filter is present.

    The query is handed to ``pager`` ordered by ``l`` with the descending
    flag set.  Unparsable parameters are ignored rather than reported.
    """
    filtr = {}
    cutoff = None

    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # missing or malformed id: treat as "no document filter"
        docfilter = None
    if docfilter:
        filtr['doc'] = docfilter

    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi', '')))
    except Exception:
        pippifilter = None
    if pippifilter:
        filtr['pippi'] = pippifilter
    else:
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
        except ValueError:
            cutoff = None
    if cutoff:
        filtr['l'] = {'$gte': cutoff}

    orderBy = 'l'
    orderDesc = True
    template_vars = pager(request, Frags.find(filtr), orderBy, orderDesc)

    prevDoc = None
    template_vars['frags'] = []
    for frag in template_vars['data']:
        p = Pippi('', oid=frag['pippi'])
        d = Doc(oid=frag['doc'])
        if pippifilter:
            # NOTE(review): prevDoc holds the already diffed text, not the
            # raw fragment -- confirm this is what diffFrag expects
            frag['txt'] = diffFrag(prevDoc, frag['txt'])
            prevDoc = frag['txt']
        template_vars['frags'].append({
            '_id': frag['_id'],
            'pos': frag['pos'],
            'txt': " ".join(frag['txt']),
            'len': frag['l'],
            # sum of the document's tf-idf weights over the pippi's terms
            'score': sum(d.tfidf.get(t, 0) for t in p.pippi),
            'pippi': p,
            'doc': d,
        })

    template_vars['pippi'] = pippifilter
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid'])
        # find_one returns None for an unknown id; leave docTitle unset then
        # instead of failing on the subscript
        if doc is not None:
            template_vars['docTitle'] = doc['docid']
    if pippifilter:
        # flag only; the pippi text itself is not looked up here
        template_vars['pippiFilter'] = 1
    return render_to_response('frags.html',
                              template_vars,
                              context_instance=RequestContext(request))
示例#36
0
 def save(self):
     """Persist ``self.__dict__`` through ``Docs.save`` and keep the new _id.

     Entries whose names appear in ``self.fieldMap`` are removed from
     ``__dict__`` before the call and reinstated afterwards; the
     reinstatement also happens if ``Docs.save`` raises.
     """
     attrs = self.__dict__
     parked = {}
     for key in self.fieldMap:
         if key in attrs:
             parked[key] = attrs.pop(key)
     try:
         attrs['_id'] = Docs.save(attrs)
     finally:
         # always put the parked fields back, even when saving failed
         attrs.update(parked)
示例#37
0
def listDocs(request):
    """Build the pager context for all documents and delegate to ``_listDocs``.

    The documents are fetched sorted by ``docid`` descending and the page
    title is set to the corpus title.
    """
    all_docs = Docs.find(sort=[('docid', pymongo.DESCENDING)])
    page_vars = pager(request, all_docs, 'docid', False)
    page_vars['title'] = 'Complete Corpus of pippi longstrings'
    return _listDocs(request, page_vars)