Пример #1
0
 def __init__(self,raw=None,docid=None,oid=None,d=None):
     """Load a stored document, or create and save a new one from ``raw``.

     A truthy ``oid`` is looked up by ``_id`` and takes precedence over
     ``docid``, which is looked up by ``docid``; when neither is given the
     caller-supplied ``d`` is used as is.  A truthy record is copied onto
     the instance.  Otherwise a truthy ``raw`` creates a new document,
     which is saved immediately.

     Raises:
         KeyError: no record is available and ``raw`` is empty.
     """
     if oid:
         # the Mongo object id wins over docid
         d = Docs.find_one({"_id": oid})
     elif docid:
         d = Docs.find_one({"docid": docid})

     if d:
         # existing record: adopt its fields as instance attributes
         self.__dict__.update(d)
         return

     if not raw:
         raise KeyError('empty docid')

     # nothing stored yet: build a fresh document around the raw content
     attrs = self.__dict__
     attrs.update({
         'docid' : docid,
         'pippies' : [],
         'pippiDocs' : [],
         'pippiDocsLen' : 0,
         'rawid' : None,
         })
     # only filled in when not already present on the instance
     attrs.setdefault('type', 'raw')
     attrs.setdefault('metadata', {})
     self.raw = raw
     self.lang = guessLanguage(" ".join(self.text))
     self.save()
Пример #2
0
 def __init__(self,raw=None,docid=None,oid=None,d=None,owner=None):
     """Load a stored document, or create and save a new owned one.

     Lookup is by ``oid`` (field ``_id``) when given, else by ``docid``;
     with neither, the caller-supplied ``d`` is used.  A truthy record is
     copied onto the instance.  Otherwise a truthy ``raw`` creates a new
     document titled after ``docid`` and owned by ``unicode(owner)``,
     which is saved immediately.

     Raises:
         KeyError: no record is available and ``raw`` is empty.
     """
     query = None
     if oid:
         query = {"_id": oid}
     elif docid:
         query = {"docid": docid}
     if query is not None:
         d = Docs.find_one(query)

     if d:
         # existing record: its fields become instance attributes
         self.__dict__.update(d)
     elif not raw:
         raise KeyError('empty docid')
     else:
         # brand new document
         fields = {
             'docid' : docid,
             'owner': unicode(owner),
             'pippies' : [],
             'pippiDocs' : [],
             'pippiDocsLen' : 0,
             'rawid' : None,
             'title': docid,
             }
         self.__dict__.update(fields)
         # respect a type/metadata already present on the instance
         for key, default in (('type', 'raw'), ('metadata', {})):
             if key not in self.__dict__:
                 self.__dict__[key] = default
         self.raw = raw
         self.__dict__['lang'] = guessLanguage(" ".join(self.text))
         self.stems # accessed only for its caching side effect
         self.save()
Пример #3
0
    def __init__(self, docid=None, *args,**kwargs):
        """Initialise an etherpad-backed document, importing the pad if needed.

        When ``docid`` matches ``PADRE`` and no record with that docid exists
        in ``Docs``, the pad's HTML export is downloaded, its title is
        extracted, the markup is cleaned with tidy and handed to the parent
        constructor as ``raw``.  In every other case the parent constructor
        is called with the (possibly normalised) ``docid`` only.

        Args:
            docid: pad reference; the accepted grammar is defined by
                ``PADRE``, which is not shown here.
            *args, **kwargs: forwarded to the parent constructor.
        """
        from contextlib import closing

        self.__dict__['type'] = 'etherpad'
        if docid:
            hostValidator = PADRE.search(docid)
            if hostValidator:
                # NOTE(review): groups look like (scheme, host, pad name) -- confirm against PADRE
                scheme, host, pad = hostValidator.group(1, 2, 3)
                if host and pad:
                    # normalise to "host/pad" so lookups use one canonical id
                    docid=("%s/%s" % (host, pad)).encode('utf8')
                    kwargs['docid']=docid
                url="%s%s/ep/pad/export/%s/latest?format=html" % (scheme or 'http://', host, pad)
                if not Docs.find_one({"docid": docid}):
                    # close the HTTP response explicitly instead of leaving it to the GC
                    with closing(urllib2.urlopen(url)) as response:
                        context = response.read()
                    soup = BeautifulSoup(context)
                    self.__dict__['title']=unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')

                    doc='<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>%s</html>' % (self.title, unescape(unicode(soup.body)).encode('utf8'))
                    raw=str(tidy.parseString(doc, **{'output_xhtml' : 1,
                                                     'add_xml_decl' : 0,
                                                     'indent' : 0,
                                                     'tidy_mark' : 0,
                                                     'doctype' : "strict",
                                                     'wrap' : 0}))
                    kwargs['raw'] = raw
                    kwargs['docid']=docid
                    super(Etherpad,self).__init__(*args, **kwargs)
                    if 'stems' not in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    return
            kwargs['docid']=docid
        super(Etherpad,self).__init__(*args, **kwargs)
Пример #4
0
    def __init__(self, docid=None, *args,**kwargs):
        """Initialise a co-ment-backed document, importing the text if needed.

        When ``docid`` matches ``CMTRE`` and no record with that docid exists
        in ``Docs``, the text's view page is fetched for its title and the
        comments page for the element with id ``textcontainer``; both are
        combined into an HTML document handed to the parent constructor as
        ``raw``.  In every other case the parent constructor is called with
        the (possibly normalised) ``docid`` only.

        Args:
            docid: text reference; the accepted grammar is defined by
                ``CMTRE``, which is not shown here.
            *args, **kwargs: forwarded to the parent constructor.
        """
        from contextlib import closing

        self.__dict__['type'] = 'co-ment'
        if docid:
            hostValidator = CMTRE.search(docid)
            if hostValidator:
                # NOTE(review): group(2) looks like the host and group(4) the text key -- confirm against CMTRE
                host = hostValidator.group(2)
                textkey = hostValidator.group(4)
                if hostValidator.group(1) or hostValidator.group(3) or hostValidator.group(5):
                    # normalise to host+key so lookups use one canonical id
                    docid=("%s%s" % (host, textkey)).encode('utf8')
                    kwargs['docid']=docid
                url="https://%s/text/%s/view/" % (host, textkey)
                if not Docs.find_one({"docid": docid}):
                    # close each HTTP response explicitly instead of leaving it to the GC
                    with closing(urllib2.urlopen(url)) as response:
                        context = response.read()
                    soup = BeautifulSoup(context)
                    self.__dict__['title']=unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')

                    # NOTE(review): "/text%s" has no slash before the key, unlike "/text/%s" above -- confirm intended
                    dataurl = "https://%s/text%s/comments/" % (host, textkey)
                    with closing(urllib2.urlopen(dataurl)) as response:
                        data = response.read()
                    soup = BeautifulSoup(data)

                    kwargs['raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (self.title, unescape(unicode(soup.find(attrs={'id' : 'textcontainer'}))).encode('utf8'))
                    kwargs['docid']=docid
                    super(Coment,self).__init__(*args, **kwargs)
                    if 'stems' not in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    return
            kwargs['docid']=docid
        super(Coment,self).__init__(*args, **kwargs)
Пример #5
0
 def __init__(self, docid=None, *args,**kwargs):
     """Initialise a EUR-Lex document, fetching it when it is not stored yet.

     ``docid`` is either a shortcut matching ``SHORTCUTRE`` or a
     colon-separated identifier whose second field is the document code
     (first character: sector, next four: year, then a one- or two-character
     doctype, remainder: reference number) and whose third field is the
     language.  The parsed parts are stored on the instance and the parent
     constructor is called with ``self.docid``.

     If ``Docs`` holds no record for ``self.docid`` the page is fetched
     through ``CACHE.fetchUrl`` (retried up to four more times while it
     reports an error), passed through ``anchorArticles`` as ``raw``, and
     the title is set and saved after the parent constructor has run.

     Raises:
         ValueError: "Request Error" or "Parameter Error" when the fetched
             page still reports that error after the retries are used up;
             "Language Error" when the page carries the "no English
             version" notice.
     """
     setTitle=False
     if docid:
         alias=re.match(SHORTCUTRE,docid)
         if alias:
             self.__dict__['sector'] = '3'
             self.__dict__['year'] = alias.group(2)
             self.__dict__['doctype'] = SHORTCUTMAP[alias.group(1)]
             self.__dict__['refno'] = "%04d" % int(alias.group(3))
             self.__dict__['lang'] = 'EN' # assuming default
         else:
             (code,lang)=docid.split(":")[1:3]
             # the doctype is two characters long when the seventh character is a letter
             st=7 if code[6].isalpha() else 6
             self.__dict__['sector'] = code[0]
             self.__dict__['year'] = code[1:5]
             self.__dict__['doctype'] = code[5:st]
             self.__dict__['refno'] = code[st:]
             self.__dict__['lang'] = lang
         self.__dict__['type'] = 'eurlex'
         # NOTE(review): self.docid is not assigned here; presumably derived by the class from the parts above
         kwargs['docid']=self.docid
         if not Docs.find_one({"docid": self.docid}):
             retries=4
             while True:
                 raw=CACHE.fetchUrl(EURLEXURL+self.docid+":HTML")
                 soup=BeautifulSoup(raw)
                 # TODO handle empty or invalid celex ids - also handle other languages besides english!!!
                 # <TITLE>Request Error</TITLE>
                 # <h1>The parameters of the link are incorrect.</h1>
                 error=None
                 if soup.title and soup.title.string == "Request Error":
                     error="Request Error"
                 elif soup.h1 and soup.h1.string == 'The parameters of the link are incorrect.':
                     error="Parameter Error"
                 if error is None:
                     # no errors found, continue, nothing to see here
                     break
                 if retries<=0:
                     raise ValueError(error)
                 retries=retries-1
             # > /* There is no English version of this document available since it was not included in the English Special Edition.
             content=soup.find(id='TexteOnly')
             paragraphs=content.findAll('p') if content else []
             if len(paragraphs)>1:
                 # .string can be missing or None; treat both as "no notice"
                 notice=getattr(paragraphs[1],'string',None)
                 if notice and notice.strip().startswith('/* There is no English version of this document available since it was not included in the English Special Edition.'):
                     raise ValueError("Language Error")
             kwargs['raw']=anchorArticles(raw)
             self.__dict__['metadata'] = self.extractMetadata()
             setTitle=True
     super(Eurlex,self).__init__(*args, **kwargs)
     if setTitle:
         self.__dict__['title']=self._gettitle()
         self.save()
Пример #6
0
def frags(request):
    """Render the paged fragment list, optionally filtered via the query string.

    Query parameters read from ``request.GET``:
        doc    -- ObjectId of a document; restricts fragments to it.
        pippi  -- ObjectId of a pippi; restricts fragments to it and replaces
                  each fragment text with the output of ``diffFrag`` against
                  the previous fragment.
        cutoff -- minimum fragment length ``l`` (default 7); only read when
                  no valid ``pippi`` is given.

    Unparseable parameters are ignored rather than reported.

    Returns:
        The response produced by rendering ``frags.html``.
    """
    filtr={}
    docfilter=None
    cutoff=None
    pippifilter=None
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc','')))
    except Exception:
        # missing or malformed id: no document filter
        pass
    if docfilter:
        filtr['doc']=docfilter
    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi','')))
    except Exception:
        # missing or malformed id: no pippi filter
        pass
    if pippifilter:
        filtr['pippi']=pippifilter
    else:
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff','7')))
        except (TypeError, ValueError):
            # non-numeric cutoff: no length filter
            pass
    if cutoff: filtr['l']={ '$gte': cutoff }
    orderBy = 'l'
    orderDesc = True
    template_vars=pager(request,Frags.find(filtr),orderBy,orderDesc)
    prevDoc=None
    template_vars['frags']=[]
    for frag in template_vars['data']:
        p=Pippi('',oid=frag['pippi'])
        d=Doc(oid=frag['doc'])
        if pippifilter:
            frag['txt']=diffFrag(prevDoc,frag['txt'])
            prevDoc=frag['txt']
        template_vars['frags'].append({'_id': frag['_id'],
                                       'pos': frag['pos'],
                                       'txt': " ".join(frag['txt']),
                                       'len': frag['l'],
                                       'score': sum(d.tfidf.get(t,0) for t in p.pippi),
                                       'pippi': p,
                                       'doc': d,
                                       })

    template_vars['pippi']=pippifilter
    template_vars['doc']=docfilter
    if docfilter:
        doc=Docs.find_one({'_id': docfilter},['docid'])
        # a well-formed id may still match no document; leave the title unset then
        if doc is not None:
            template_vars['docTitle']=doc['docid']
    if pippifilter: template_vars['pippiFilter']=1 #" ".join(Pippies.find_one({'_id': pippifilter},['pippi'])['pippi'])
    return render_to_response('frags.html', template_vars, context_instance=RequestContext(request))
Пример #7
0
def Doc(*args, **kwargs):
    """Build the document object whose class fits the given keyword arguments.

    ``DOCTYPES`` is scanned as ``(type name, class, docid pattern)`` triples.
    The first applicable rule wins:
      1. ``docid`` matches a pattern;
      2. the record ``d`` has that type name, or its docid matches a pattern;
      3. the record stored under ``oid`` has that type name.
    If nothing applies, the generic ``DOC`` class is used.  A missing
    ``docid``/``d`` value, or an ``oid`` with no stored record or type, simply
    skips the corresponding rule.

    Returns:
        The object returned by calling the selected class with ``*args`` and
        ``**kwargs``.
    """
    docid = kwargs.get('docid')
    if docid is not None:
        for (t,c,r) in DOCTYPES:
            if re.match(r,docid):
                return c(*args,**kwargs)
    d = kwargs.get('d')
    if d is not None:
        dtype = d.get('type','')
        # a stored docid of None must not reach re.match
        ddocid = d.get('docid') or ''
        for (t,c,r) in DOCTYPES:
            if dtype == t or re.match(r,ddocid):
                return c(*args,**kwargs)
    if 'oid' in kwargs:
        stored = Docs.find_one({"_id": kwargs['oid']},['type'])
        # find_one yields None for an unknown id; fall through to DOC then
        dt = stored.get('type') if stored else None
        if dt is not None:
            for (t,c,r) in DOCTYPES:
                if dt == t:
                    return c(*args,**kwargs)
    return DOC(*args,**kwargs)
Пример #8
0
def Doc(*args, **kwargs):
    """Build the document object whose class fits the given keyword arguments.

    ``DOCTYPES`` is scanned as ``(type name, class, docid pattern)`` triples.
    The first applicable rule wins:
      1. ``docid`` matches a pattern;
      2. the record ``d`` has that type name, or its docid matches a pattern;
      3. the record stored under ``oid`` has that type name.
    If nothing applies, the generic ``DOC`` class is used.  A missing
    ``docid``/``d`` value, or an ``oid`` with no stored record or type, simply
    skips the corresponding rule.

    Returns:
        The object returned by calling the selected class with ``*args`` and
        ``**kwargs``.
    """
    docid = kwargs.get('docid')
    if docid is not None:
        for (t,c,r) in DOCTYPES:
            if re.match(r,docid):
                return c(*args,**kwargs)
    d = kwargs.get('d')
    if d is not None:
        dtype = d.get('type','')
        # a stored docid of None must not reach re.match
        ddocid = d.get('docid') or ''
        for (t,c,r) in DOCTYPES:
            if dtype == t or re.match(r,ddocid):
                return c(*args,**kwargs)
    if 'oid' in kwargs:
        stored = Docs.find_one({"_id": kwargs['oid']},['type'])
        # find_one yields None for an unknown id; fall through to DOC then
        dt = stored.get('type') if stored else None
        if dt is not None:
            for (t,c,r) in DOCTYPES:
                if dt == t:
                    return c(*args,**kwargs)
    return DOC(*args,**kwargs)
Пример #9
0
def pippies(request):
    """Render the paged pippi list, optionally filtered via the query string.

    Query parameters read from ``request.GET``:
        cutoff    -- minimum ``len`` (default 7).
        doc       -- ObjectId of a document; keeps pippies listing it in ``docs``.
        relevance -- integer; keeps pippies with exactly that relevance.
        orderby   -- field to sort on (default ``relevance``).
        desc      -- ``1`` (default) sorts descending, anything else ascending.

    Unparseable parameters are ignored rather than reported.  A cutoff or
    relevance of 0 is treated as "no filter".

    Returns:
        The response produced by rendering ``pippies.html``.
    """
    filtr = {}
    docfilter = None
    relfilter = None
    cutoff = None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except (TypeError, ValueError):
        # non-numeric cutoff: no length filter
        pass
    if cutoff:
        filtr['len'] = {'$gte': cutoff}
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # missing or malformed id: no document filter
        pass
    if docfilter:
        filtr['docs'] = docfilter
    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except (TypeError, ValueError):
        # absent or non-numeric relevance: no relevance filter
        pass
    if relfilter:
        filtr['relevance'] = relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = '1' == cgi.escape(request.GET.get('desc', '1'))
    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [{
        'id': pippi['_id'],
        # empty tokens are shown as '*'
        'pippi': ' '.join([p or '*' for p in pippi['pippi'].split(' ')]),
        'docslen': len(pippi['docs']),
        'relevance': pippi.get('relevance', 0),
    } for pippi in template_vars['data']]
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        # a well-formed id may still match no document; leave the title unset then
        if doc is not None:
            template_vars['docTitle'] = (doc['title'] if 'title' in doc
                                         else doc['docid'])
    return render_to_response('pippies.html',
                              template_vars,
                              context_instance=RequestContext(request))
Пример #10
0
    def __init__(self, docid=None, *args, **kwargs):
        """Initialise a co-ment-backed document, importing the text if needed.

        When ``docid`` matches ``CMTRE`` and no record with that docid exists
        in ``Docs``, the text's view page is fetched for its title and the
        comments page for the element with id ``textcontainer``; both are
        combined into an HTML document handed to the parent constructor as
        ``raw``.  In every other case the parent constructor is called with
        the (possibly normalised) ``docid`` only.

        Args:
            docid: text reference; the accepted grammar is defined by
                ``CMTRE``, which is not shown here.
            *args, **kwargs: forwarded to the parent constructor.
        """
        from contextlib import closing

        self.__dict__['type'] = 'co-ment'
        if docid:
            hostValidator = CMTRE.search(docid)
            if hostValidator:
                # NOTE(review): group(2) looks like the host and group(4) the text key -- confirm against CMTRE
                host = hostValidator.group(2)
                textkey = hostValidator.group(4)
                if (hostValidator.group(1) or hostValidator.group(3)
                        or hostValidator.group(5)):
                    # normalise to host+key so lookups use one canonical id
                    docid = ("%s%s" % (host, textkey)).encode('utf8')
                    kwargs['docid'] = docid
                url = "https://%s/text/%s/view/" % (host, textkey)
                if not Docs.find_one({"docid": docid}):
                    # close each HTTP response explicitly instead of leaving it to the GC
                    with closing(urllib2.urlopen(url)) as response:
                        context = response.read()
                    soup = BeautifulSoup(context)
                    title_text = ''.join(soup.title.findAll(text=True))
                    self.__dict__['title'] = unescape(
                        unicode(title_text)).strip().encode('utf8')

                    # NOTE(review): "/text%s" has no slash before the key, unlike "/text/%s" above -- confirm intended
                    dataurl = "https://%s/text%s/comments/" % (host, textkey)
                    with closing(urllib2.urlopen(dataurl)) as response:
                        data = response.read()
                    soup = BeautifulSoup(data)

                    container = soup.find(attrs={'id': 'textcontainer'})
                    kwargs['raw'] = '<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>%s</body></html>' % (
                        self.title,
                        unescape(unicode(container)).encode('utf8'))
                    kwargs['docid'] = docid
                    super(Coment, self).__init__(*args, **kwargs)
                    if 'stems' not in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    return
            kwargs['docid'] = docid
        super(Coment, self).__init__(*args, **kwargs)
Пример #11
0
def pippies(request):
    """Render the paged pippi list, optionally filtered via the query string.

    Query parameters read from ``request.GET``:
        cutoff    -- minimum ``len`` (default 7).
        doc       -- ObjectId of a document; keeps pippies listing it in ``docs``.
        relevance -- integer; keeps pippies with exactly that relevance.
        orderby   -- field to sort on (default ``relevance``).
        desc      -- ``1`` (default) sorts descending, anything else ascending.

    Unparseable parameters are ignored rather than reported.  A cutoff or
    relevance of 0 is treated as "no filter".

    Returns:
        The response produced by rendering ``pippies.html``.
    """
    filtr={}
    docfilter=None
    relfilter=None
    cutoff=None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff','7')))
    except (TypeError, ValueError):
        # non-numeric cutoff: no length filter
        pass
    if cutoff: filtr['len']={ '$gte': cutoff }
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc','')))
    except Exception:
        # missing or malformed id: no document filter
        pass
    if docfilter:
        filtr['docs']=docfilter
    try:
        relfilter =  int(cgi.escape(request.GET.get('relevance','')))
    except (TypeError, ValueError):
        # absent or non-numeric relevance: no relevance filter
        pass
    if relfilter: filtr['relevance']=relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby','relevance'))
    orderDesc = '1'==cgi.escape(request.GET.get('desc','1'))
    template_vars=pager(request,Pippies.find(filtr),orderBy,orderDesc)
    template_vars['pippies']=[{'id': pippi['_id'],
                               # empty tokens are shown as '*'
                               'pippi': ' '.join([p or '*' for p in pippi['pippi'].split(' ')]),
                               'docslen':len(pippi['docs']),
                               'relevance':pippi.get('relevance',0),}
                               for pippi in template_vars['data']]
    template_vars['doc']=docfilter
    if docfilter:
        doc=Docs.find_one({'_id': docfilter},['docid', 'title'])
        # a well-formed id may still match no document; leave the title unset then
        if doc is not None:
            template_vars['docTitle']=doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html', template_vars, context_instance=RequestContext(request))
Пример #12
0
def frags(request):
    """Render the paged fragment list, optionally filtered via the query string.

    Query parameters read from ``request.GET``:
        doc    -- ObjectId of a document; restricts fragments to it.
        pippi  -- ObjectId of a pippi; restricts fragments to it and replaces
                  each fragment text with the output of ``diffFrag`` against
                  the previous fragment.
        cutoff -- minimum fragment length ``l`` (default 7); only read when
                  no valid ``pippi`` is given.

    Unparseable parameters are ignored rather than reported.

    Returns:
        The response produced by rendering ``frags.html``.
    """
    filtr = {}
    docfilter = None
    cutoff = None
    pippifilter = None
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # missing or malformed id: no document filter
        pass
    if docfilter:
        filtr['doc'] = docfilter
    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi', '')))
    except Exception:
        # missing or malformed id: no pippi filter
        pass
    if pippifilter:
        filtr['pippi'] = pippifilter
    else:
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
        except (TypeError, ValueError):
            # non-numeric cutoff: no length filter
            pass
    if cutoff:
        filtr['l'] = {'$gte': cutoff}
    orderBy = 'l'
    orderDesc = True
    template_vars = pager(request, Frags.find(filtr), orderBy, orderDesc)
    prevDoc = None
    template_vars['frags'] = []
    for frag in template_vars['data']:
        p = Pippi('', oid=frag['pippi'])
        d = Doc(oid=frag['doc'])
        if pippifilter:
            frag['txt'] = diffFrag(prevDoc, frag['txt'])
            prevDoc = frag['txt']
        template_vars['frags'].append({
            '_id': frag['_id'],
            'pos': frag['pos'],
            'txt': " ".join(frag['txt']),
            'len': frag['l'],
            'score': sum(d.tfidf.get(t, 0) for t in p.pippi),
            'pippi': p,
            'doc': d,
        })

    template_vars['pippi'] = pippifilter
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid'])
        # a well-formed id may still match no document; leave the title unset then
        if doc is not None:
            template_vars['docTitle'] = doc['docid']
    if pippifilter:
        template_vars[
            'pippiFilter'] = 1  #" ".join(Pippies.find_one({'_id': pippifilter},['pippi'])['pippi'])
    return render_to_response('frags.html',
                              template_vars,
                              context_instance=RequestContext(request))