def __init__(self, raw=None, docid=None, oid=None, d=None):
    """Load a document from the Docs collection or create a new one.

    raw   -- raw document body; triggers creation when no stored doc is found
    docid -- external document id, used for lookup and stored on creation
    oid   -- mongo ObjectId used for lookup (takes precedence over docid)
    d     -- pre-fetched document dict; used directly when no lookup hits

    Raises:
        KeyError: when there is neither a stored document nor raw content.
    """
    if oid:
        # get by mongo oid
        d = Docs.find_one({"_id": oid})
    elif docid:
        # get by docid
        d = Docs.find_one({"docid": docid})
    if d:
        # load the stored values straight into the instance dict
        self.__dict__.update(d)
    elif raw:
        # create a new document skeleton
        self.__dict__.update({
            'docid': docid,
            'pippies': [],
            'pippiDocs': [],
            'pippiDocsLen': 0,
            'rawid': None,
        })
        if 'type' not in self.__dict__:
            self.__dict__['type'] = 'raw'
        if 'metadata' not in self.__dict__:
            self.__dict__['metadata'] = {}
        # NOTE: the original re-tested `if raw:` here, which is always true
        # inside this `elif raw:` branch; the redundant check was removed.
        self.raw = raw  # plain attribute set; presumably stores the body via __setattr__ — confirm
        self.lang = guessLanguage(" ".join(self.text))
        self.save()
    else:
        raise KeyError('empty docid')
def __init__(self, raw=None, docid=None, oid=None, d=None, owner=None):
    """Load an existing document by mongo oid or docid, or create a new
    one from *raw* content attributed to *owner*.

    Raises:
        KeyError: when no stored document matches and no raw content given.
    """
    if oid:
        # get by mongo oid
        d = Docs.find_one({"_id": oid})
    elif docid:
        # get by docid
        d = Docs.find_one({"docid": docid})
    if d:
        # load the stored values straight into the instance dict
        self.__dict__.update(d)
    elif raw:
        # create a new document skeleton
        self.__dict__.update({
            'docid': docid,
            # NOTE(review): owner=None becomes u'None' here — confirm intended
            'owner': unicode(owner),
            'pippies': [],
            'pippiDocs': [],
            'pippiDocsLen': 0,
            'rawid': None,
            'title': docid,
        })
        if not 'type' in self.__dict__:
            self.__dict__['type'] = 'raw'
        if not 'metadata' in self.__dict__:
            self.__dict__['metadata'] = {}
        # plain attribute set; presumably stores the body via __setattr__ — confirm
        self.raw = raw
        self.__dict__['lang'] = guessLanguage(" ".join(self.text))
        self.stems  # for caching
        self.save()
    else:
        raise KeyError('empty docid')
def __init__(self, docid=None, *args, **kwargs):
    """Etherpad document.

    Resolves *docid* against PADRE; on the first sighting of a pad the
    HTML export is downloaded, tidied and handed to the base constructor
    as raw content.  Falls through to the base constructor unchanged when
    the docid does not look like a pad URL.
    """
    self.__dict__['type'] = 'etherpad'
    if docid:
        hostValidator = PADRE.search(docid)
        if hostValidator:
            if hostValidator.group(2) and hostValidator.group(3):
                # canonical docid is "<host>/<padname>"
                docid = ("%s/%s" % (hostValidator.group(2), hostValidator.group(3))).encode('utf8')
                kwargs['docid'] = docid
                url = "%s%s/ep/pad/export/%s/latest?format=html" % (
                    hostValidator.group(1) or 'http://',
                    hostValidator.group(2),
                    hostValidator.group(3))
                if not Docs.find_one({"docid": docid}):
                    # first time we see this pad: download and tidy the export
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    self.__dict__['title'] = unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')
                    doc = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>%s</html>' % (self.title, unescape(unicode(soup.body)).encode('utf8'))
                    # normalize the scraped markup to strict xhtml
                    raw = str(tidy.parseString(doc, **{'output_xhtml': 1,
                                                       'add_xml_decl': 0,
                                                       'indent': 0,
                                                       'tidy_mark': 0,
                                                       'doctype': "strict",
                                                       'wrap': 0}))
                    kwargs['raw'] = raw
                kwargs['docid'] = docid
                super(Etherpad, self).__init__(*args, **kwargs)
                if not 'stems' in self.__dict__ or not self.stems:
                    # let's calculate and cache the results
                    models.tfidf.add_input_document(self.termcnt.keys())
                    self.save()
                return
    # not a recognizable pad reference: plain base-class construction
    kwargs['docid'] = docid
    super(Etherpad, self).__init__(*args, **kwargs)
def __init__(self, docid=None, *args, **kwargs):
    """co-ment document.

    Resolves *docid* against CMTRE; on the first sighting the text view
    (for the title) and the comments view (for the body) are scraped and
    handed to the base constructor as raw content.  Falls through to the
    base constructor unchanged when the docid does not match.
    """
    self.__dict__['type'] = 'co-ment'
    if docid:
        hostValidator = CMTRE.search(docid)
        if hostValidator:
            if hostValidator.group(1) or hostValidator.group(3) or hostValidator.group(5):
                # canonical docid is "<host><text-path>"
                docid = ("%s%s" % (hostValidator.group(2), hostValidator.group(4))).encode('utf8')
                kwargs['docid'] = docid
                url = "https://%s/text/%s/view/" % (hostValidator.group(2), hostValidator.group(4))
                if not Docs.find_one({"docid": docid}):
                    # first fetch: title page, then the comments view for the body
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    self.__dict__['title'] = unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')
                    dataurl = "https://%s/text%s/comments/" % (hostValidator.group(2), hostValidator.group(4))
                    data = urllib2.urlopen(dataurl).read()
                    soup = BeautifulSoup(data)
                    kwargs['raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (self.title, unescape(unicode(soup.find(attrs={'id': 'textcontainer'}))).encode('utf8'))
                kwargs['docid'] = docid
                super(Coment, self).__init__(*args, **kwargs)
                if not 'stems' in self.__dict__ or not self.stems:
                    # let's calculate and cache the results
                    models.tfidf.add_input_document(self.termcnt.keys())
                    self.save()
                return
    # not a recognizable co-ment reference: plain base-class construction
    kwargs['docid'] = docid
    super(Coment, self).__init__(*args, **kwargs)
def __init__(self, docid=None, *args, **kwargs):
    """Eurlex document.

    Parses *docid* either as a SHORTCUTRE alias or as a celex id
    ("...:<code>:<lang>...") into sector/year/doctype/refno/lang, fetches
    the HTML from EURLEXURL on first sighting (with a small retry loop for
    transient "Request Error" pages) and defers to the base constructor.

    Raises:
        ValueError: on persistent request/parameter errors or when no
            English version of the document exists.
    """
    setTitle = False
    if docid:
        alias = re.match(SHORTCUTRE, docid)
        if alias:
            # shortcut form: sector is fixed, refno zero-padded to 4 digits
            self.__dict__['sector'] = '3'
            self.__dict__['year'] = alias.group(2)
            self.__dict__['doctype'] = SHORTCUTMAP[alias.group(1)]
            self.__dict__['refno'] = "%04d" % int(alias.group(3))
            self.__dict__['lang'] = 'EN'  # assuming default
        else:
            (code, lang) = docid.split(":")[1:3]
            # doctype is 1 or 2 chars depending on a trailing letter at index 6
            st = 7 if code[6].isalpha() else 6
            self.__dict__['sector'] = code[0]
            self.__dict__['year'] = code[1:5]
            self.__dict__['doctype'] = code[5:st]
            self.__dict__['refno'] = code[st:]
            self.__dict__['lang'] = lang
        self.__dict__['type'] = 'eurlex'
        kwargs['docid'] = self.docid
        if not Docs.find_one({"docid": self.docid}):
            retries = 4
            while True:
                raw = CACHE.fetchUrl(EURLEXURL + self.docid + ":HTML")
                soup = BeautifulSoup(raw)
                # TODO handle empty or invalid celex ids - also handle other languages besides english!!!
                # <TITLE>Request Error</TITLE>
                # <h1>The parameters of the link are incorrect.</h1>
                if soup.title and soup.title.string == "Request Error":
                    if retries > 0:
                        retries = retries - 1
                        continue
                    else:
                        raise ValueError, "Request Error"
                if soup.h1 and soup.h1.string == 'The parameters of the link are incorrect.':
                    if retries > 0:
                        retries = retries - 1
                        continue
                    else:
                        raise ValueError, "Parameter Error"
                # no errors found, continue, nothing to see here
                break
            # > /* There is no English version of this document available since it was not included in the English Special Edition.
            content = soup.find(id='TexteOnly')
            if (content and content.findAll('p') and
                    len(content.findAll('p')) > 1 and
                    'string' in dir(content.findAll('p')[1]) and
                    content.findAll('p')[1].string.strip().startswith('/* There is no English version of this document available since it was not included in the English Special Edition.')):
                raise ValueError, "Language Error"
            kwargs['raw'] = anchorArticles(raw)
            self.__dict__['metadata'] = self.extractMetadata()
            setTitle = True
    super(Eurlex, self).__init__(*args, **kwargs)
    if setTitle:
        # title extraction needs the fully constructed document
        self.__dict__['title'] = self._gettitle()
        self.save()
def frags(request):
    """Django view: list fragments, optionally filtered by doc, by pippi,
    or by a minimum fragment length (``cutoff``, default 7).

    Fix: the original used bare ``except:`` clauses around the query-string
    parsing, which also swallow SystemExit/KeyboardInterrupt; narrowed to
    ``except Exception`` while keeping the best-effort semantics.
    """
    filtr = {}
    template_vars = {}
    docfilter = None
    cutoff = None
    pippifilter = None
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        pass  # missing/invalid oid: no doc filter
    if docfilter:
        filtr['doc'] = docfilter
    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi', '')))
    except Exception:
        pass  # missing/invalid oid: no pippi filter
    if pippifilter:
        filtr['pippi'] = pippifilter
    else:
        # only apply the length cutoff when not pinned to a single pippi
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
        except Exception:
            pass
        if cutoff:
            filtr['l'] = {'$gte': cutoff}
    orderBy = 'l'
    orderDesc = True
    template_vars = pager(request, Frags.find(filtr), orderBy, orderDesc)
    prevDoc = None
    template_vars['frags'] = []
    for frag in template_vars['data']:
        p = Pippi('', oid=frag['pippi'])
        d = Doc(oid=frag['doc'])
        if pippifilter:
            # show only the diff against the previous fragment's text
            frag['txt'] = diffFrag(prevDoc, frag['txt'])
            prevDoc = frag['txt']
        template_vars['frags'].append({
            '_id': frag['_id'],
            'pos': frag['pos'],
            'txt': " ".join(frag['txt']),
            'len': frag['l'],
            'score': sum([d.tfidf.get(t, 0) for t in p.pippi]),
            'pippi': p,
            'doc': d,
        })
    template_vars['pippi'] = pippifilter
    template_vars['doc'] = docfilter
    if docfilter:
        template_vars['docTitle'] = Docs.find_one({'_id': docfilter}, ['docid'])['docid']
    if pippifilter:
        template_vars['pippiFilter'] = 1  #" ".join(Pippies.find_one({'_id': pippifilter},['pippi'])['pippi'])
    return render_to_response('frags.html', template_vars,
                              context_instance=RequestContext(request))
def Doc(*args, **kwargs):
    """Factory for Doc subclasses.

    Picks the concrete class from the DOCTYPES registry — by docid regex,
    by a pre-fetched document dict's type/docid, or by the stored type of
    a mongo oid — and falls back to the generic DOC class.
    """
    if 'docid' in kwargs:
        for doctype, cls, pattern in DOCTYPES:
            if re.match(pattern, kwargs['docid']):
                return cls(*args, **kwargs)
    if 'd' in kwargs:
        stored = kwargs['d']
        for doctype, cls, pattern in DOCTYPES:
            matches = stored.get('type', '') == doctype
            if matches or re.match(pattern, stored.get('docid', '')):
                return cls(*args, **kwargs)
    if 'oid' in kwargs:
        stored_type = Docs.find_one({"_id": kwargs['oid']}, ['type'])['type']
        for doctype, cls, pattern in DOCTYPES:
            if doctype == stored_type:
                return cls(*args, **kwargs)
    return DOC(*args, **kwargs)
def pippies(request):
    """Django view: list pippies, filterable by minimum length (``cutoff``,
    default 7), containing document (``doc``) and ``relevance``; sortable
    via ``orderby``/``desc`` query parameters.

    Fix: the original used bare ``except:`` clauses around the query-string
    parsing, which also swallow SystemExit/KeyboardInterrupt; narrowed to
    ``except Exception`` while keeping the best-effort semantics.
    """
    filtr = {}
    template_vars = {}
    docfilter = None
    relfilter = None
    cutoff = None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except Exception:
        pass  # non-numeric cutoff: no length filter
    if cutoff:
        filtr['len'] = {'$gte': cutoff}
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        pass  # missing/invalid oid: no doc filter
    if docfilter:
        filtr['docs'] = docfilter
    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except Exception:
        pass  # non-numeric relevance: no relevance filter
    if relfilter:
        filtr['relevance'] = relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = True if '1' == cgi.escape(request.GET.get('desc', '1')) else False
    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [{
        'id': pippi['_id'],
        # empty tokens render as wildcards
        'pippi': ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
        'docslen': len(pippi['docs']),
        'relevance': pippi.get('relevance', 0),
    } for pippi in template_vars['data']]
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        template_vars['docTitle'] = doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html', template_vars,
                              context_instance=RequestContext(request))
def __init__(self, docid=None, *args, **kwargs):
    """co-ment document (formatted variant).

    Resolves *docid* against CMTRE; on the first sighting the text view
    (for the title) and the comments view (for the body) are scraped and
    handed to the base constructor as raw content.  Falls through to the
    base constructor unchanged when the docid does not match.
    """
    self.__dict__['type'] = 'co-ment'
    if docid:
        hostValidator = CMTRE.search(docid)
        if hostValidator:
            if hostValidator.group(1) or hostValidator.group(3) or hostValidator.group(5):
                # canonical docid is "<host><text-path>"
                docid = ("%s%s" % (hostValidator.group(2), hostValidator.group(4))).encode('utf8')
                kwargs['docid'] = docid
                url = "https://%s/text/%s/view/" % (hostValidator.group(2), hostValidator.group(4))
                if not Docs.find_one({"docid": docid}):
                    # first fetch: title page, then the comments view for the body
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    self.__dict__['title'] = unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')
                    dataurl = "https://%s/text%s/comments/" % (hostValidator.group(2), hostValidator.group(4))
                    data = urllib2.urlopen(dataurl).read()
                    soup = BeautifulSoup(data)
                    kwargs['raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (self.title, unescape(unicode(soup.find(attrs={'id': 'textcontainer'}))).encode('utf8'))
                kwargs['docid'] = docid
                super(Coment, self).__init__(*args, **kwargs)
                if not 'stems' in self.__dict__ or not self.stems:
                    # let's calculate and cache the results
                    models.tfidf.add_input_document(self.termcnt.keys())
                    self.save()
                return
    # not a recognizable co-ment reference: plain base-class construction
    kwargs['docid'] = docid
    super(Coment, self).__init__(*args, **kwargs)
def pippies(request):
    """Django view: list pippies, filterable by minimum length (``cutoff``,
    default 7), containing document (``doc``) and ``relevance``; sortable
    via ``orderby``/``desc`` query parameters."""
    filtr = {}
    template_vars = {}
    docfilter = None
    relfilter = None
    cutoff = None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except:
        pass  # best-effort: non-numeric cutoff means no length filter
    if cutoff:
        filtr['len'] = {'$gte': cutoff}
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except:
        pass  # best-effort: missing/invalid oid means no doc filter
    if docfilter:
        filtr['docs'] = docfilter
    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except:
        pass  # best-effort: non-numeric relevance means no relevance filter
    if relfilter:
        filtr['relevance'] = relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = True if '1' == cgi.escape(request.GET.get('desc', '1')) else False
    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [{'id': pippi['_id'],
                                 # empty tokens render as wildcards
                                 'pippi': ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
                                 'docslen': len(pippi['docs']),
                                 'relevance': pippi.get('relevance', 0), }
                                for pippi in template_vars['data']]
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        template_vars['docTitle'] = doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html', template_vars,
                              context_instance=RequestContext(request))
def frags(request):
    """Django view: list fragments, optionally filtered by doc, by pippi,
    or by a minimum fragment length (``cutoff``, default 7)."""
    filtr = {}
    template_vars = {}
    docfilter = None
    cutoff = None
    pippifilter = None
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except:
        pass  # best-effort: missing/invalid oid means no doc filter
    if docfilter:
        filtr['doc'] = docfilter
    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi', '')))
    except:
        pass  # best-effort: missing/invalid oid means no pippi filter
    if pippifilter:
        filtr['pippi'] = pippifilter
    else:
        # length cutoff only applies when not pinned to a single pippi
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
        except:
            pass
        if cutoff:
            filtr['l'] = {'$gte': cutoff}
    orderBy = 'l'
    orderDesc = True
    template_vars = pager(request, Frags.find(filtr), orderBy, orderDesc)
    prevDoc = None
    template_vars['frags'] = []
    for frag in template_vars['data']:
        p = Pippi('', oid=frag['pippi'])
        d = Doc(oid=frag['doc'])
        if pippifilter:
            # show only the diff against the previous fragment's text
            frag['txt'] = diffFrag(prevDoc, frag['txt'])
            prevDoc = frag['txt']
        template_vars['frags'].append({
            '_id': frag['_id'],
            'pos': frag['pos'],
            'txt': " ".join(frag['txt']),
            'len': frag['l'],
            'score': sum([d.tfidf.get(t, 0) for t in p.pippi]),
            'pippi': p,
            'doc': d,
        })
    template_vars['pippi'] = pippifilter
    template_vars['doc'] = docfilter
    if docfilter:
        template_vars['docTitle'] = Docs.find_one({'_id': docfilter},
                                                  ['docid'])['docid']
    if pippifilter:
        template_vars[
            'pippiFilter'] = 1  #" ".join(Pippies.find_one({'_id': pippifilter},['pippi'])['pippi'])
    return render_to_response('frags.html', template_vars,
                              context_instance=RequestContext(request))