Exemplos de removeEncodingDeclaration em Python, exemplos de commonutil.lxmlutil.removeEncodingDeclaration em Python

Exemplo n.º 1

0

Exibir arquivo

    def post(self):
        self.response.headers['Content-Type'] = 'text/plain'
        data = json.loads(self.request.body)
        callbackurl = data['callbackurl']

        triedcount = data.get('triedcount', 0)
        monitorRequest = data['request']
        feedback = {}
        urlUsed, content = _fetchContent(monitorRequest, triedcount, feedback)
        slug = monitorRequest['slug']
        fetchurl = monitorRequest['fetchurl']
        if not content:
            triedcount += 1
            leftcount = _FETCH_TRYCOUNT - triedcount
            message = 'Failed to fetch content form %s for %s, lefted: %s.' % (
                fetchurl,
                slug,
                leftcount,
            )
            logging.error(message)
            self.response.out.write(message)
            if leftcount > 0:
                data['triedcount'] = triedcount
                taskqueue.add(queue_name="default",
                              payload=json.dumps(data),
                              url='/fetch/single/')
                return
        items = None
        responseData = None
        if content:
            content = lxmlutil.removeEncodingDeclaration(content)
            selector = monitorRequest['selector']
            conditions = monitorRequest.get('conditions', {})
            formatter = monitorRequest.get('formatter')
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector, conditions,
                                 formatter)

            if items and conditions and conditions.get('detectdetail'):
                detaildetector.populateDetailUrls(items)
        sourceSlug = data['origin']['common']['slug']
        if items:
            sourceDeprecated = models.isSourceDeprecated(sourceSlug)
            if sourceDeprecated:
                models.removeDeprecatedSource(sourceSlug)
            message = 'Items got for %s.' % (slug, )
            logging.info(message)
            self.response.out.write(message)

            oldhash = monitorRequest['fetchhash']
            fetchhash = _calculateHash(items)
            if oldhash != fetchhash or sourceDeprecated:
                responseData = {
                    'origin': data['origin'],
                    'result': {
                        'items': items,
                        'fetchhash': fetchhash,
                    },
                }
        else:
            models.addDeprecatedSource(sourceSlug)
            responseData = {
                'origin': data['origin'],
                'result': None,
            }

            if content:
                message = 'Failed to parse items from %s for %s by %s.' % (
                    fetchurl, slug, selector)
            elif feedback.get('overflow'):
                message = 'Quote overflow.'
                responseData['overflow'] = True
            else:
                message = 'Failed to fetch content from %s for %s.' % (
                    fetchurl, slug)
            logging.error(message)
            self.response.out.write(message)

        if responseData:
            success = networkutil.postData(callbackurl,
                                           responseData,
                                           tag=slug,
                                           trycount=_CALLBACK_TRYCOUNT,
                                           timeout=_URL_TIMEOUT)

            if success:
                message = 'Push items back for %s to %s.' % (slug, callbackurl)
            else:
                message = 'Failed to push items back for %s to %s.' % (
                    slug, callbackurl)
            logging.info(message)
            self.response.out.write(message)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: handlersapi.py Projeto: economylin/newsmonitor

    def post(self):
        self.response.headers['Content-Type'] = 'text/plain'
        data = json.loads(self.request.body)
        callbackurl = data['callbackurl']

        triedcount = data.get('triedcount', 0)
        monitorRequest = data['request']
        feedback = {}
        urlUsed, content = _fetchContent(monitorRequest, triedcount, feedback)
        slug = monitorRequest['slug']
        fetchurl = monitorRequest['fetchurl']
        if not content:
            triedcount += 1
            leftcount = _FETCH_TRYCOUNT - triedcount
            message = 'Failed to fetch content form %s for %s, lefted: %s.' % (
                        fetchurl, slug, leftcount, )
            logging.error(message)
            self.response.out.write(message)
            if leftcount > 0:
                data['triedcount'] = triedcount
                taskqueue.add(queue_name="default", payload=json.dumps(data),
                            url='/fetch/single/')
                return
        items = None
        responseData = None
        if content:
            content = lxmlutil.removeEncodingDeclaration(content)
            selector = monitorRequest['selector']
            conditions = monitorRequest.get('conditions', {})
            formatter = monitorRequest.get('formatter')
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector, conditions, formatter)

            if items and conditions and conditions.get('detectdetail'):
                detaildetector.populateDetailUrls(items)
        sourceSlug = data['origin']['common']['slug']
        if items:
            sourceDeprecated = models.isSourceDeprecated(sourceSlug)
            if sourceDeprecated:
                models.removeDeprecatedSource(sourceSlug)
            message = 'Items got for %s.' % (slug, )
            logging.info(message)
            self.response.out.write(message)

            oldhash = monitorRequest['fetchhash']
            fetchhash = _calculateHash(items)
            if oldhash != fetchhash or sourceDeprecated:
                responseData = {
                        'origin': data['origin'],
                        'result': {
                            'items': items,
                            'fetchhash': fetchhash,
                        },
                }
        else:
            models.addDeprecatedSource(sourceSlug)
            responseData = {
                    'origin': data['origin'],
                    'result': None,
                }

            if content:
                message = 'Failed to parse items from %s for %s by %s.' % (
                                      fetchurl, slug, selector)
            elif feedback.get('overflow'):
                message = 'Quote overflow.'
                responseData['overflow'] = True
            else:
                message = 'Failed to fetch content from %s for %s.' % (
                                        fetchurl, slug)
            logging.error(message)
            self.response.out.write(message)

        if responseData:
            success = networkutil.postData(callbackurl, responseData, tag=slug,
                        trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)

            if success:
                message = 'Push items back for %s to %s.' % (slug, callbackurl)
            else:
                message = 'Failed to push items back for %s to %s.' % (slug, callbackurl)
            logging.info(message)
            self.response.out.write(message)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: handlers.py Projeto: economylin/newsmonitor

    def post(self):
        action = self.request.get('action')
        keyword = ''
        pageinfo = None
        if action == 'JSON':
            jsonstr = self.request.get('jsonstr')
            if jsonstr:
                newssource = json.loads(jsonstr)
            else:
                newssource = _DEFAULT_NEWSSOURCE
            encodingUsed = ''
            urlUsed = ''
            content = ''
            httpheader = ''
            formatter = ''
        else:
            keyword = self.request.get('keyword').strip()
            pageinfo = self.request.get('pageinfo').strip()
            if pageinfo:
                pageinfo = json.loads(pageinfo)
            newssource = {}
            newssource['active'] = bool(self.request.get('active'))
            newssource['slug'] = self.request.get('slug')
            newssource['name'] = self.request.get('name')
            newssource['order'] = self.request.get('order')
            newssource['charts'] = bool(self.request.get('charts'))
            newssource['fetchurl'] = self.request.get('fetchurl')
            if newssource['fetchurl'] and not newssource['fetchurl'].startswith('http'):
                newssource['fetchurl'] = 'http://' + newssource['fetchurl']
            if not newssource['slug'] and newssource['fetchurl']:
                newssource['slug'] = urlparse.urlparse(newssource['fetchurl']).netloc
            httpheader = self.request.get('httpheader')
            if httpheader:
                newssource['header'] = json.loads(httpheader)
            newssource['encoding'] = self.request.get('encoding')
            newssource['tags'] = self.request.get('tags')

            # following fields only for showing parsed result.
            encodingUsed = self.request.get('encodingUsed')
            urlUsed = self.request.get('urlUsed')
            oldContent = self.request.get('oldContent')

            newssource['selector'] = self.request.get('selector').strip()
            conditions = {}
            conditions['returnall'] = bool(self.request.get('returnall'))
            conditions['emptytitle'] = bool(self.request.get('emptytitle'))
            conditions['detectdetail'] = bool(self.request.get('detectdetail'))
            conditions['scripttext'] = bool(self.request.get('scripttext'))
            excludeselector = self.request.get('excludeselector').strip()
            if excludeselector:
                if 'exclude' not in conditions:
                    conditions['exclude'] = {}
                conditions['exclude']['selector'] = excludeselector

            includeselector = self.request.get('includeselector').strip()
            if includeselector:
                if 'include' not in conditions:
                    conditions['include'] = {}
                conditions['include']['selector'] = includeselector
            urlselector = self.request.get('urlselector').strip()
            titleselector = self.request.get('titleselector').strip()
            imageselector = self.request.get('imageselector').strip()
            contentselector = self.request.get('contentselector').strip()
            linkselector = self.request.get('linkselector').strip()
            imagelinkselector = self.request.get('imagelinkselector').strip()
            if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
                conditions['criterion'] = {}
                if urlselector:
                    conditions['criterion']['url'] = urlselector
                if titleselector:
                    conditions['criterion']['title'] = titleselector
                if contentselector:
                    conditions['criterion']['content'] = contentselector
                if imageselector:
                    conditions['criterion']['image'] = imageselector
                if linkselector:
                    conditions['criterion']['link'] = linkselector
                if imagelinkselector:
                    conditions['criterion']['imagelink'] = imagelinkselector
            newssource['conditions'] = conditions

            formatter = self.request.get('formatter')
            if formatter:
                newssource['formatter'] = json.loads(formatter)

            newssource['description'] = self.request.get('description').strip()

            content = self.request.get('content')
            jsonstr = jsonutil.getReadableString(newssource)

        if 'active' not in newssource:
            newssource['active'] = True

        items = []
        links = []
        selector = newssource.get('selector')
        fetchurl = newssource.get('fetchurl')

        tried = 2 # the max try count is 3
        if not content and fetchurl:
            fetcher = ContentFetcher(fetchurl,
                            header=newssource.get('header'),
                            encoding=newssource.get('encoding'), tried=tried
                         )
            fetchResult = fetcher.fetch()
            content = fetchResult.get('content')
            oldContent = fetchResult.get('content.old')
            urlUsed = fetchResult.get('url')
            encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                fetchResult.get('encoding.src'))
        if content:
            content = lxmlutil.removeEncodingDeclaration(content)
            if selector:
                parser = HtmlContentParser()
                items = parser.parse(urlUsed, content, selector,
                newssource.get('conditions'), newssource.get('formatter'))
            else:
                links = linkdetector.detect(content, keyword)

        if items and newssource.get('conditions', {}).get('detectdetail'):
            detaildetector.populateDetailUrls(items)

        if newssource.get('header'):
            httpheader = jsonutil.getReadableString(newssource['header'])

        if newssource.get('formatter'):
            formatter = jsonutil.getReadableString(newssource['formatter'])

        if not pageinfo and fetchurl:
            pageinfo = pmapi.getPage(fetchurl)

        templateValues = {
            'newssource': newssource,
            'httpheader': httpheader,
            'formatter': formatter,
            'content': content,
            'oldContent': oldContent,
            'encodingUsed': encodingUsed,
            'urlUsed': urlUsed,
            'keyword': keyword,
            'links': links,
            'items': items,
            'jsonstr': jsonstr,
            'pageinfo': pageinfo,
            'strpageinfo': json.dumps(pageinfo),
        }
        self._render(templateValues)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: handlers.py Projeto: economylin/newsmonitor

    def post(self):
        action = self.request.get('action')
        keyword = ''
        pageinfo = None
        if action == 'JSON':
            jsonstr = self.request.get('jsonstr')
            if jsonstr:
                newssource = json.loads(jsonstr)
            else:
                newssource = _DEFAULT_NEWSSOURCE
            encodingUsed = ''
            urlUsed = ''
            content = ''
            httpheader = ''
            formatter = ''
        else:
            keyword = self.request.get('keyword').strip()
            pageinfo = self.request.get('pageinfo').strip()
            if pageinfo:
                pageinfo = json.loads(pageinfo)
            newssource = {}
            newssource['active'] = bool(self.request.get('active'))
            newssource['slug'] = self.request.get('slug')
            newssource['name'] = self.request.get('name')
            newssource['order'] = self.request.get('order')
            newssource['charts'] = bool(self.request.get('charts'))
            newssource['fetchurl'] = self.request.get('fetchurl')
            if newssource['fetchurl'] and not newssource[
                    'fetchurl'].startswith('http'):
                newssource['fetchurl'] = 'http://' + newssource['fetchurl']
            if not newssource['slug'] and newssource['fetchurl']:
                newssource['slug'] = urlparse.urlparse(
                    newssource['fetchurl']).netloc
            httpheader = self.request.get('httpheader')
            if httpheader:
                newssource['header'] = json.loads(httpheader)
            newssource['encoding'] = self.request.get('encoding')
            newssource['tags'] = self.request.get('tags')

            # following fields only for showing parsed result.
            encodingUsed = self.request.get('encodingUsed')
            urlUsed = self.request.get('urlUsed')
            oldContent = self.request.get('oldContent')

            newssource['selector'] = self.request.get('selector').strip()
            conditions = {}
            conditions['returnall'] = bool(self.request.get('returnall'))
            conditions['emptytitle'] = bool(self.request.get('emptytitle'))
            conditions['detectdetail'] = bool(self.request.get('detectdetail'))
            conditions['scripttext'] = bool(self.request.get('scripttext'))
            excludeselector = self.request.get('excludeselector').strip()
            if excludeselector:
                if 'exclude' not in conditions:
                    conditions['exclude'] = {}
                conditions['exclude']['selector'] = excludeselector

            includeselector = self.request.get('includeselector').strip()
            if includeselector:
                if 'include' not in conditions:
                    conditions['include'] = {}
                conditions['include']['selector'] = includeselector
            urlselector = self.request.get('urlselector').strip()
            titleselector = self.request.get('titleselector').strip()
            imageselector = self.request.get('imageselector').strip()
            contentselector = self.request.get('contentselector').strip()
            linkselector = self.request.get('linkselector').strip()
            imagelinkselector = self.request.get('imagelinkselector').strip()
            if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
                conditions['criterion'] = {}
                if urlselector:
                    conditions['criterion']['url'] = urlselector
                if titleselector:
                    conditions['criterion']['title'] = titleselector
                if contentselector:
                    conditions['criterion']['content'] = contentselector
                if imageselector:
                    conditions['criterion']['image'] = imageselector
                if linkselector:
                    conditions['criterion']['link'] = linkselector
                if imagelinkselector:
                    conditions['criterion']['imagelink'] = imagelinkselector
            newssource['conditions'] = conditions

            formatter = self.request.get('formatter')
            if formatter:
                newssource['formatter'] = json.loads(formatter)

            newssource['description'] = self.request.get('description').strip()

            content = self.request.get('content')
            jsonstr = jsonutil.getReadableString(newssource)

        if 'active' not in newssource:
            newssource['active'] = True

        items = []
        links = []
        selector = newssource.get('selector')
        fetchurl = newssource.get('fetchurl')

        tried = 2  # the max try count is 3
        if not content and fetchurl:
            fetcher = ContentFetcher(fetchurl,
                                     header=newssource.get('header'),
                                     encoding=newssource.get('encoding'),
                                     tried=tried)
            fetchResult = fetcher.fetch()
            content = fetchResult.get('content')
            oldContent = fetchResult.get('content.old')
            urlUsed = fetchResult.get('url')
            encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                      fetchResult.get('encoding.src'))
        if content:
            content = lxmlutil.removeEncodingDeclaration(content)
            if selector:
                parser = HtmlContentParser()
                items = parser.parse(urlUsed, content, selector,
                                     newssource.get('conditions'),
                                     newssource.get('formatter'))
            else:
                links = linkdetector.detect(content, keyword)

        if items and newssource.get('conditions', {}).get('detectdetail'):
            detaildetector.populateDetailUrls(items)

        if newssource.get('header'):
            httpheader = jsonutil.getReadableString(newssource['header'])

        if newssource.get('formatter'):
            formatter = jsonutil.getReadableString(newssource['formatter'])

        if not pageinfo and fetchurl:
            pageinfo = pmapi.getPage(fetchurl)

        templateValues = {
            'newssource': newssource,
            'httpheader': httpheader,
            'formatter': formatter,
            'content': content,
            'oldContent': oldContent,
            'encodingUsed': encodingUsed,
            'urlUsed': urlUsed,
            'keyword': keyword,
            'links': links,
            'items': items,
            'jsonstr': jsonstr,
            'pageinfo': pageinfo,
            'strpageinfo': json.dumps(pageinfo),
        }
        self._render(templateValues)