Пример #1
0
def remove_tags(self, s):
    # Reduce a piece of markup to a single line of plain text.
    #
    # s: any value; it is converted with str() before processing.
    # Returns the text with <script>/<style> elements and all remaining tags
    # replaced by blanks, whitespace runs collapsed to one blank, and
    # leading/trailing whitespace stripped.
    #
    # Documented with comments instead of a docstring on purpose, so the
    # method's (Zope) publishing status is not altered by documentation.
    #
    # NOTE(review): several pairs below map a character to itself, which is
    # a no-op. Presumably the keys were HTML entities (&Auml; etc.) before
    # some tool decoded them -- verify against the upstream source.
    replacements = {
        '–': '-',
        '·': '.',
        ' ': ' ',
        '“': '',
        '§': '',
        'Ä': u'Ä',
        'Ö': u'Ö',
        'Ü': u'Ü',
        'ä': u'ä',
        'ö': u'ö',
        'ü': u'ü',
        'ß': u'ß'
    }
    text = str(s)
    for needle, substitute in replacements.items():
        text = text.replace(needle, substitute)

    # Script and style elements go first (including their content), then
    # every tag that is left over.
    for pattern in (
            '<script(.*?)>(.|\\n|\\r|\\t)*?</script>',
            '<style(.*?)>(.|\\n|\\r|\\t)*?</style>',
            '<[^>]*>',
    ):
        text = standard.re_sub(pattern, ' ', text)

    # str.replace already substitutes every occurrence, so one pass per
    # control character is enough.
    for control_char in ('\t', '\n', '\r'):
        text = text.replace(control_char, ' ')

    # Collapsing double blanks can create new double blanks, hence the loop.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text.strip()
Пример #2
0
 def search_xml(self,
                q,
                page_index=0,
                page_size=10,
                REQUEST=None,
                RESPONSE=None):
     """ZMSZCatalogSolrConnector.search_xml

     Run an edismax query with highlighting against the configured Solr
     core and return the XML answer.

     q          -- query string, passed on as Solr parameter ``q``.
     page_index -- converted with int() and passed unchanged as Solr
                   ``start``.
     page_size  -- converted with int() and passed as Solr ``rows``.
     REQUEST    -- must be supplied although it defaults to None: it is
                   used unconditionally to set 'lang' and to obtain the
                   response object.
     RESPONSE   -- ignored; REQUEST.RESPONSE is used instead.

     Returns the result of ``self.http_import`` with the ``_i``/``_s``/``_t``
     suffix removed from every ``name="..."`` attribute.
     """
     # Check constraints.
     adapter = self.getCatalogAdapter()
     attrs = adapter.getAttrs()
     start = int(page_index)
     rows = int(page_size)
     REQUEST.set('lang', REQUEST.get('lang', self.getPrimaryLanguage()))
     response = REQUEST.RESPONSE
     for header_name, header_value in (
             ('Content-Type', 'text/xml; charset=utf-8'),
             ('Cache-Control', 'no-cache'),
             ('Pragma', 'no-cache'),
             ('Access-Control-Allow-Origin', '*'),
     ):
         response.setHeader(header_name, header_value)

     # Execute query. The dict is filled in the same key order as before so
     # the generated query string keeps its parameter order.
     params = {
         'q': q,
         'wt': 'xml',
         'start': start,
         'rows': rows,
         'defType': 'edismax',
         # One "<field>^<boost>" entry per catalog attribute.
         'qf': ' '.join(
             '%s^%s' % (self._get_field_name(attr_id),
                        standard.pystr(attrs[attr_id].get('boost', 1.0)))
             for attr_id in attrs),
         'hl': 'true',
         'hl.fragsize': self.getConfProperty('solr.select.hl.fragsize', 200),
         # By default every catalog attribute is highlighted.
         'hl.fl': self.getConfProperty(
             'solr.select.hl.fl',
             ','.join(self._get_field_name(attr_id) for attr_id in attrs)),
         'hl.simple.pre': self.getConfProperty(
             'solr.select.hl.simple.pre', '<span class="highlight">'),
         'hl.simple.post': self.getConfProperty(
             'solr.select.hl.simple.post', '</span>'),
     }
     base_url = self.getConfProperty('solr.url',
                                     'http://localhost:8983/solr')
     core = self.getConfProperty('solr.core', self.getAbsoluteHome().id)
     select_url = self.url_append_params(
         '%s/%s/select' % (base_url, core), params, sep='&')
     raw_xml = self.http_import(select_url, method='GET')
     # Hide the type suffix of the Solr field names from the caller.
     return standard.re_sub('name="(.*?)_[ist]"', 'name="\\1"', raw_xml)
Пример #3
0
 def re_sub(self, pattern, replacement, subject, ignorecase=False):
     # Backward-compatibility shim: reports the call through warn(), naming
     # 'Products.zms.standard.re_sub' as the replacement, and then delegates
     # to standard.re_sub with the arguments unchanged.
     # Documented with comments instead of a docstring on purpose, so the
     # method's (Zope) publishing status is not altered by documentation.
     warn(self, 're_sub', 'Products.zms.standard.re_sub')
     substituted = standard.re_sub(pattern, replacement, subject, ignorecase)
     return substituted
Пример #4
0
    def recurse_downloadHtmlPages(self, obj, path, lang, REQUEST):
        # Export the rendered page of `obj` and, recursively, of all its page
        # children as static files below `path`.
        #
        # obj     -- node to export; its index html is rendered with REQUEST.
        # path    -- target directory for this node (created if missing).
        # lang    -- language id used in the file name "index_<lang><ext>";
        #            for the primary language a second copy "index<ext>" is
        #            written as well.
        # REQUEST -- current request; 'ZMS_PATH_HANDLER' is set on it and
        #            keys added to REQUEST.other while rendering are removed
        #            again afterwards.
        #
        # Failures are reported through standard.writeError and do not stop
        # the export of the remaining nodes.
        #
        # Documented with comments instead of a docstring on purpose, so the
        # method's (Zope) publishing status is not altered by documentation.
        try:
            os.mkdir(path)
        except OSError:
            # Best effort: usually the directory exists already. A path that
            # really cannot be used is reported below when writing fails.
            pass

        level = obj.getLevel()
        key = 'index'
        REQUEST.set('ZMS_PATH_HANDLER', True)
        try:

            # Remember others. Take a real snapshot of the key names: a dict
            # view cannot be copied with copy.copy() on Python 3 and would
            # follow later changes of the dict anyway.
            known_keys = set(REQUEST.other.keys())

            root = getattr(obj, '__root__', None)
            if root is not None:
                REQUEST.set('ZMS_PROXY_%s' % root.id, obj)
                html = root.f_index_html(root, REQUEST)
            else:
                html = obj.f_index_html(obj, REQUEST)

            # Remove new others. Iterate over a copy of the keys, because
            # deleting from a dict while iterating its live view raises
            # RuntimeError on Python 3.
            for rk in list(REQUEST.other.keys()):
                if rk not in known_keys:
                    try:
                        del REQUEST.other[rk]
                    except KeyError:
                        pass

            # Blank lines in includes cause PHP session errors
            # @see http://bugs.php.net/bug.php?id=8974
            html = standard.re_sub(r'^\s*', '', html)

            # Localize html.
            html = localHtml(obj, html)

            # Save html to file.
            if level > 0 and \
               self.getConfProperty('ZMS.pathhandler', 0) != 0 and \
               self.getConfProperty('ZMS.export.pathhandler', 0) == 1:
                html = localIndexHtml(self, obj, level - 1, html)
                filename = '%s/../%s%s' % (path, obj.getDeclId(REQUEST),
                                           obj.getPageExt(REQUEST))
            else:
                pageext = obj.getPageExt(REQUEST)
                html = localIndexHtml(self, obj, level - self.getLevel(), html)
                filename = '%s/%s_%s%s' % (path, key, lang, pageext)

            html = self.exportExternalResources(obj, html, path, REQUEST)

            # @see http://docs.python.org/howto/unicode.html (Reading and Writing Unicode Data)
            encoding = REQUEST.get('ZMS_CHARSET', 'utf-8')
            mode = 'w'
            writeFile(obj, filename, html, mode, encoding)

            # Root folder requires and defaults to "index.html" at most systems.
            if lang == self.getPrimaryLanguage():
                filename = '%s/%s%s' % (path, key, obj.getPageExt(REQUEST))
                writeFile(obj, filename, html, mode, encoding)

        except Exception:
            standard.writeError(
                obj, "[recurse_downloadHtmlPages]: Can't get html '%s'" % key)

        # Process methods of meta-objects.
        exportable_types = ('DTML Document', 'DTML Method', 'Page Template',
                            'Script (Python)')
        for metadictAttrId in self.getMetaobjAttrIds(obj.meta_id):
            # Reset per iteration so the error message below never refers to
            # an unbound name or to the attribute of a previous iteration.
            metadictAttr = None
            try:
                metadictAttr = self.getMetaobjAttr(obj.meta_id, metadictAttrId)
                if metadictAttr is None or not metadictAttr['meta_type'] \
                        or metadictAttr['type'] not in self.getMetaobjIds():
                    continue
                metaObj = self.getMetaobj(metadictAttr['type'])
                if metaObj['type'] != 'ZMSResource':
                    continue
                # NOTE(review): the loop variable is not used in the body;
                # the templates are looked up on and rendered with `obj`,
                # once per child. Confirm that this is intended.
                for metadictObj in obj.getObjChildren(metadictAttr['id'],
                                                      REQUEST):
                    for metaObjAttr in metaObj['attrs']:
                        if metaObjAttr['type'] not in exportable_types:
                            continue
                        ob = getattr(obj, metaObjAttr['id'])
                        html = localHtml(obj, ob(obj, REQUEST))
                        filename = '%s/%s' % (path, metaObjAttr['id'])
                        # Context manager closes the file even if write fails.
                        with open(filename, 'w') as f:
                            f.write(html)
            except Exception:
                # Report the attribute definition when it could be loaded,
                # otherwise at least its id.
                failed = metadictAttr if metadictAttr is not None \
                    else metadictAttrId
                standard.writeError(
                    self,
                    "[recurse_downloadHtmlPages]: can't process method '%s' of meta-object"
                    % (failed,))

        # Process children.
        for child in obj.filteredChildNodes(REQUEST, self.PAGES):
            self.recurse_downloadHtmlPages(
                child, '%s/%s' % (path, child.getDeclId(REQUEST)), lang,
                REQUEST)
def manage_zcatalog_update_documents(self):
    # Read the sitemap.xml stored below the instance's var/ directory, send
    # "commit", per-document "extract" and "optimize" requests, and return a
    # plain-text protocol of the steps (one message per line).
    #
    # NOTE(review): the text of this function is incomplete. Between the
    # default value of 'solr.url' and the following `if i >= 0:` a span of
    # code is missing: `i`, `doc`, `docs`, `bins`, `buff`, `get_command_xml`
    # and `update_extract` are used below but never defined in what is
    # visible here. Restore the function from its original source before
    # changing or running it.
    msg = []
    request = self.REQUEST
    RESPONSE = request.RESPONSE
    # Context object referenced by the request parameter 'uid'
    # ('{$}' is used when the parameter is absent).
    zmscontext = self.getLinkObj(request.get('uid', '{$}'))
    home = zmscontext.getDocumentElement()
    home_id = home.getPhysicalPath()
    # The path segment directly in front of 'content' is taken as home id.
    home_id = home_id[home_id.index('content') - 1]
    inst_home = zmscontext.Control_Panel.getINSTANCE_HOME()
    path = home_id
    node = home
    # Walk up the chain of portal masters and prefix the path with the id of
    # each master's home, until no further master exists.
    while True:
        node = node.getPortalMaster()
        if node is None: break
        path = node.getHome().getId() + '/' + path

    xmlpath = '%s/var/%s/sitemap.xml' % (inst_home, path)
    # NOTE(review): meaning of 'threshold': -1 is defined by
    # standard.localfs_read -- presumably "no size limit"; confirm.
    xml = standard.localfs_read(xmlpath, mode={
        'threshold': -1
    }).decode('utf-8')
    msg.append('%i xml-bytes read from %s' % (len(xml), xmlpath))

    def update(action, xml):
        # NOTE(review): truncated -- see the note at the top of the function.
        # The default URL below has been masked and the lines that followed
        # it are missing.
        solr_url = zmscontext.getConfProperty('solr.url',
                                              'http://*****:*****@@')
        if i >= 0:
            # Field content that follows the '@@' marker, up to the closing
            # </field> tag.
            body = doc[i + len('">@@'):]
            body = body[:body.find('</field>')]
            # NOTE(review): `id` shadows the builtin of the same name.
            id = standard.re_search('<field name="id"(.*?)>(.*?)<\/field>',
                                    doc)[1]
            # Remember id and marker content for the extract step below.
            bins.append({'id': id, 'body': body})
            # Empty all '*_t' fields whose content starts with '@@'.
            doc = standard.re_sub(
                '<field name="(.*?)_t"(.*?)>@@(.*?)<\/field>',
                '<field name="\\1_t"\\2></field>', doc)
        buff.append(doc)
        docs.remove(docs[0])
    msg.append(update("commit", get_command_xml('commit')))
    for b in bins:
        # The remembered body has the form '<path>:<attribute name>'.
        body = b['body'].split(':')
        # '/'-separated ids; the first segment is dropped.
        ids = body[0].split('/')[1:]
        attr_name = body[1]
        # Resolve the node by following the ids from the document element.
        node = home
        while ids:
            node = getattr(node, ids.pop(0))
        d = {
            'literal.id': b['id'],
            'commit': 'true',
            'myfile': node.attr(attr_name).getData(request)
        }
        msg.append(update_extract("extract", d))
    msg.append(update("optimize", get_command_xml('optimize')))

    RESPONSE.setHeader('Content-Type', 'text/plain;charset=utf-8')
    msg.append("Done!")
    return '\n'.join(msg)


# --// /manage_zcatalog_update_documents //--