예제 #1
def fetch(url):
    """Fetch *url* and collect basic metadata from the page's ``<head>``.

    The page is downloaded through ``ContentFetcher`` and parsed with
    ``lxml.html``.  The returned dict holds whichever of the keys
    ``'keywords'``, ``'description'`` and ``'title'`` were found with a
    non-empty value, each passed through ``lxmlutil.getPureString``.

    An empty dict is returned when no content was fetched or when the
    content cannot be parsed as HTML (the parse failure is logged).
    """
    result = {}
    fetcher = ContentFetcher(url, tried=2)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return result
    try:
        htmlelement = lxml.html.fromstring(content)
    except Exception:
        logging.error('Failed to load html from content.')
        return result

    # Wrap the parsed tree once and reuse it for every selector.
    query = pyquery.PyQuery(htmlelement)

    # Both meta tags carry their value in the ``content`` attribute.
    for name in ('keywords', 'description'):
        match = query('head meta[name=%s]' % (name, ))
        if match:
            value = match[0].get('content')
            if value:
                result[name] = lxmlutil.getPureString(value)

    match = query('head title')
    if match:
        title = match[0].text_content()
        if title:
            result['title'] = lxmlutil.getPureString(title)
    return result
예제 #2
def _fetchContent(data, triedcount, feedback):
    """Download the page described by *data* and return ``(url, content)``.

    *data* must contain ``'fetchurl'``; the optional ``'header'`` and
    ``'encoding'`` entries are forwarded to ``ContentFetcher`` together
    with *triedcount*.  *feedback* is handed to ``fetcher.fetch``.

    Returns a tuple of the ``'url'`` and ``'content'`` entries of the fetch
    result; either may be None when the result does not carry it.
    """
    fetcher = ContentFetcher(
        data['fetchurl'],
        header=data.get('header'),
        encoding=data.get('encoding'),
        tried=triedcount)
    outcome = fetcher.fetch(feedback)
    content = outcome.get('content')
    return outcome.get('url'), content
예제 #3
    def post(self):
        """Fill in missing item fields from their pages and post them back.

        The request body is JSON with ``'items'``, ``'origin'``,
        ``'callbackurl'`` and an optional ``'header'``.  For every item
        that has a ``'url'`` the page is fetched and analysed, and the
        item's ``'title'``, ``'published'``, ``'content'`` and ``'img'`` are
        filled in from the analysis when the item does not already have
        them.  The item's ``'url'`` is replaced by the URL reported by the
        fetcher.  Finally the origin and the items are posted to the
        callback URL.

        Items without a URL, or whose content cannot be fetched, are left
        untouched.  Analysis errors are logged and do not abort the loop.
        """
        data = json.loads(self.request.body)

        items = data['items']
        origin = data['origin']
        header = data.get('header')
        for item in items:
            url = item.get('url')
            if not url:
                # Nothing to fetch for this item.
                continue
            # NOTE(review): the continuation of this call was missing from
            # the source; only the arguments that were visible are passed.
            # Confirm whether further keyword arguments belong here.
            fetcher = ContentFetcher(url, header=header)
            fetchResult = fetcher.fetch()
            usedUrl = fetchResult.get('url')
            content = fetchResult.get('content')
            if not content:
                logging.error('Failed to get content from %s.' % (url, ))
                continue
            item['url'] = usedUrl
            try:
                editorFormat = globalconfig.getEditorFormat()
                page = pageanalyst.analyse(usedUrl, content,
                            editorFormat=editorFormat, monitorTitle=item.get('title'))
                if not item.get('title') and page.get('title'):
                    item['title'] = page['title']
                if not item.get('published') and page.get('published') \
                        and not page['published'].endswith('0000'):
                    # if no hour, minute, published is not precise enough
                    item['published'] = page['published']
                    if origin.get('timezone'):
                        item['published'] = dateutil.adjustDate14(item['published'], origin['timezone'])
                if not item.get('content') and page.get('content'):
                    item['content'] = page['content']
                if not item.get('img') and page.get('images'):
                    item['img'] = page['images'][0]
            except Exception:
                # Best effort: one bad page must not stop the other items.
                logging.exception('Error happens when analyse %s.' % (usedUrl, ))

        responseData = {
                'origin': data['origin'],
                'items': items,
        }

        self.response.headers['Content-Type'] = 'text/plain'
        callbackurl = data['callbackurl']
        success = networkutil.postData(callbackurl, responseData,
                    trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)

        if success:
            message = 'Push items back for %s to %s.' % (data['origin'], callbackurl)
        else:
            message = 'Failed to push items back for %s to %s.' % (data['origin'], callbackurl)
        # NOTE(review): nothing in the visible code consumes ``message``;
        # presumably it is logged or written to the response -- confirm.
예제 #4
def _fetchContent(data, triedcount, feedback):
    """Download the page described by *data* and return ``(url, content)``.

    *data* must contain ``'fetchurl'``; its optional ``'header'`` and
    ``'encoding'`` entries are passed to ``ContentFetcher`` along with
    *triedcount*.  *feedback* is handed to ``fetcher.fetch``.

    Returns a tuple of the ``'url'`` and ``'content'`` entries of the fetch
    result; either may be None when the result does not carry it.
    """
    fetchurl = data['fetchurl']
    header = data.get('header')
    encoding = data.get('encoding')
    # The constructor call was cut off after its first argument; the
    # keyword arguments are the locals prepared above.
    fetcher = ContentFetcher(fetchurl, header=header,
                                encoding=encoding, tried=triedcount)
    fetchResult = fetcher.fetch(feedback)
    content = fetchResult.get('content')
    urlUsed = fetchResult.get('url')
    return urlUsed, content
예제 #5
def fetch(url):
    """Query data.alexa.com for the host of *url* and return what it reports.

    The response is parsed with ``lxmlutil.parseFromUnicode`` and handed to
    ``getAlexaInfo`` and ``getDmozInfo``.  The returned dict contains the
    keys ``'alexa'`` and/or ``'dmoz'`` for whichever of those helpers gave
    a truthy value; it is empty when no content could be fetched.
    """
    netloc = urlparse.urlparse(url).netloc
    queryurl = 'http://data.alexa.com/data?cli=10&url=%s' % (netloc, )
    info = {}
    content = ContentFetcher(queryurl, tried=2).fetch().get('content')
    if content:
        tree = lxmlutil.parseFromUnicode(content)
        alexa = getAlexaInfo(tree)
        if alexa:
            info['alexa'] = alexa
        dmoz = getDmozInfo(tree)
        if dmoz:
            info['dmoz'] = dmoz
    return info
예제 #6
def _detectDetailUrl(url, title):
    """Look on the page at *url* for a link whose text matches *title*.

    Every ``<a>`` element of the fetched page is examined; an element is a
    candidate when ``lxmlutil.getCleanText`` of it equals *title*.

    Returns the ``href`` of the first candidate that has a non-empty one,
    joined against *url*, or None when the page cannot be fetched or no
    such link exists.
    """
    tried = 2
    fetcher = ContentFetcher(url, tried=tried)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return None
    docelement = lxml.html.fromstring(content)
    aElements = pyquery.PyQuery(docelement)('a')
    for aElement in aElements:
        if lxmlutil.getCleanText(aElement) != title:
            # Only anchors whose text matches the title are candidates.
            continue
        detailUrl = aElement.get('href')
        if detailUrl:
            # href may be relative; resolve it against the page URL.
            return urlparse.urljoin(url, detailUrl)
    return None
예제 #7
def _detectDetailUrl(url, title):
    """Find the link on the page at *url* whose text matches *title*.

    The page is fetched with ``ContentFetcher`` and each ``<a>`` element is
    compared to *title* via ``lxmlutil.getCleanText``.

    Returns the first matching element's non-empty ``href`` resolved
    against *url*, or None when no content was fetched or nothing matches.
    """
    tried = 2
    fetcher = ContentFetcher(url, tried=tried)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return None
    docelement = lxml.html.fromstring(content)
    aElements = pyquery.PyQuery(docelement)('a')
    for aElement in aElements:
        if lxmlutil.getCleanText(aElement) != title:
            # Skip anchors whose text is not the title we are looking for.
            continue
        detailUrl = aElement.get('href')
        if detailUrl:
            # Turn a possibly relative href into an absolute URL.
            return urlparse.urljoin(url, detailUrl)
    return None
예제 #8
 def get(self):
     """Render the analysed page for the ``url`` request parameter.

     The parameter is base64-decoded and every pair of adjacent characters
     is swapped to obtain the real URL.  The analysed page is looked up in
     memcache under a hash of the URL; on a miss the page is fetched and
     analysed, and cached when it has content or images.

     When no page could be obtained the response is redirected to the URL
     and a page holding only the URL is rendered.  Image URLs of the page
     are rewritten to go through ``/image/?url=``.
     """
     url = self.request.get('url')
     page = None
     # Initialised up front so the check below also works without a url.
     contentGot = False
     if url:
         try:
             url = base64.b64decode(url)
             # Undo the obfuscation: swap each pair of adjacent characters;
             # a trailing unpaired character is kept as is.
             url2 = ''
             length = len(url)
             for i in range(0, length, 2):
                 if i + 1 < length:
                     url2 += url[i+1] + url[i]
             if length % 2 != 0:
                 url2 += url[-1]
             url = url2
         except TypeError:
             # NOTE(review): the handler body was missing from the source;
             # presumably an undecodable value is used as given -- confirm.
             pass
         key = stringutil.calculateHash([url])
         page = memcache.get(key)
         contentGot = bool(page)
         if not page:
             tried = 2 # the max try count is 3
             fetcher = ContentFetcher(url, tried=tried)
             fetchResult = fetcher.fetch()
             content = fetchResult.get('content')
             if content:
                 editorFormat = globalconfig.getEditorFormat()
                 page = pageanalyst.analyse(url, content, editorFormat=editorFormat)
                 if page:
                     page['url'] = url
                 # Only cache pages that actually carry something to show.
                 if page and (page.get('content') or page.get('images')):
                     memcache.set(key, page)
                     contentGot = True
     if not contentGot:
         page = {'url': url}
         self.redirect(url, permanent=True)
     if 'images' in page:
         for image in page['images']:
             image['url'] = '/image/?url=' + urllib.quote(image['url'].encode('utf-8'))
     templateValues = {
         'page': page,
     }
     self.render(templateValues, 'home.html')
예제 #9
 def post(self):
     """Fetch and analyse the submitted URL and render the test page.

     Request parameters read: ``url``, ``title``, ``fortest`` and
     ``httpheader`` (a JSON string, decoded into the header passed to the
     fetcher).  When a URL is given the page is fetched; when content was
     obtained it is analysed with ``pageanalyst.analyse``.  The inputs, the
     fetch result details and the analysis are rendered with ``test.html``.
     """
     url = self.request.get('url')
     title = self.request.get('title')
     fetchResult = {}
     content = None
     page = None
     fortest = bool(self.request.get('fortest'))
     httpheader = self.request.get('httpheader')
     header = None
     if httpheader:
         header = json.loads(httpheader)
     if url:
         tried = 2 # the max try count is 3
         # The constructor call was cut off after its first argument; the
         # locals prepared for it above are ``header`` and ``tried``.
         fetcher = ContentFetcher(url, header=header, tried=tried)
         fetchResult = fetcher.fetch()
         content = fetchResult.get('content')
     elementResult = {}
     if content:
         editorFormat = globalconfig.getEditorFormat()
         page = pageanalyst.analyse(url, content, editorFormat=editorFormat,
                             monitorTitle=title, fortest=fortest, elementResult=elementResult)
     if header:
         # Show the header back to the user in readable form.
         httpheader = jsonutil.getReadableString(header)
     templateValues = {
         'url': url,
         'title': title,
         'fortest': fortest,
         'httpheader': httpheader,
         'encoding': fetchResult.get('encoding'),
         'encodingSrc': fetchResult.get('encoding.src'),
         'oldContent': fetchResult.get('content.old'),
         'content': fetchResult.get('content'),
         'pagestr': jsonutil.getReadableString(page),
         'page': page,
         'elementResult': elementResult,
     }
     self.render(templateValues, 'test.html')
예제 #10
    def post(self):
        action = self.request.get('action')
        keyword = ''
        pageinfo = None
        if action == 'JSON':
            jsonstr = self.request.get('jsonstr')
            if jsonstr:
                newssource = json.loads(jsonstr)
                newssource = _DEFAULT_NEWSSOURCE
            encodingUsed = ''
            urlUsed = ''
            content = ''
            httpheader = ''
            formatter = ''
            keyword = self.request.get('keyword').strip()
            pageinfo = self.request.get('pageinfo').strip()
            if pageinfo:
                pageinfo = json.loads(pageinfo)
            newssource = {}
            newssource['active'] = bool(self.request.get('active'))
            newssource['slug'] = self.request.get('slug')
            newssource['name'] = self.request.get('name')
            newssource['order'] = self.request.get('order')
            newssource['charts'] = bool(self.request.get('charts'))
            newssource['fetchurl'] = self.request.get('fetchurl')
            if newssource['fetchurl'] and not newssource['fetchurl'].startswith('http'):
                newssource['fetchurl'] = 'http://' + newssource['fetchurl']
            if not newssource['slug'] and newssource['fetchurl']:
                newssource['slug'] = urlparse.urlparse(newssource['fetchurl']).netloc
            httpheader = self.request.get('httpheader')
            if httpheader:
                newssource['header'] = json.loads(httpheader)
            newssource['encoding'] = self.request.get('encoding')
            newssource['tags'] = self.request.get('tags')

            # following fields only for showing parsed result.
            encodingUsed = self.request.get('encodingUsed')
            urlUsed = self.request.get('urlUsed')
            oldContent = self.request.get('oldContent')

            newssource['selector'] = self.request.get('selector').strip()
            conditions = {}
            conditions['returnall'] = bool(self.request.get('returnall'))
            conditions['emptytitle'] = bool(self.request.get('emptytitle'))
            conditions['detectdetail'] = bool(self.request.get('detectdetail'))
            conditions['scripttext'] = bool(self.request.get('scripttext'))
            excludeselector = self.request.get('excludeselector').strip()
            if excludeselector:
                if 'exclude' not in conditions:
                    conditions['exclude'] = {}
                conditions['exclude']['selector'] = excludeselector

            includeselector = self.request.get('includeselector').strip()
            if includeselector:
                if 'include' not in conditions:
                    conditions['include'] = {}
                conditions['include']['selector'] = includeselector
            urlselector = self.request.get('urlselector').strip()
            titleselector = self.request.get('titleselector').strip()
            imageselector = self.request.get('imageselector').strip()
            contentselector = self.request.get('contentselector').strip()
            linkselector = self.request.get('linkselector').strip()
            imagelinkselector = self.request.get('imagelinkselector').strip()
            if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
                conditions['criterion'] = {}
                if urlselector:
                    conditions['criterion']['url'] = urlselector
                if titleselector:
                    conditions['criterion']['title'] = titleselector
                if contentselector:
                    conditions['criterion']['content'] = contentselector
                if imageselector:
                    conditions['criterion']['image'] = imageselector
                if linkselector:
                    conditions['criterion']['link'] = linkselector
                if imagelinkselector:
                    conditions['criterion']['imagelink'] = imagelinkselector
            newssource['conditions'] = conditions

            formatter = self.request.get('formatter')
            if formatter:
                newssource['formatter'] = json.loads(formatter)

            newssource['description'] = self.request.get('description').strip()

            content = self.request.get('content')
            jsonstr = jsonutil.getReadableString(newssource)

        if 'active' not in newssource:
            newssource['active'] = True

        items = []
        links = []
        selector = newssource.get('selector')
        fetchurl = newssource.get('fetchurl')

        tried = 2 # the max try count is 3
        if not content and fetchurl:
            fetcher = ContentFetcher(fetchurl,
                            encoding=newssource.get('encoding'), tried=tried
            fetchResult = fetcher.fetch()
            content = fetchResult.get('content')
            oldContent = fetchResult.get('content.old')
            urlUsed = fetchResult.get('url')
            encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
        if content:
            content = lxmlutil.removeEncodingDeclaration(content)
            if selector:
                parser = HtmlContentParser()
                items = parser.parse(urlUsed, content, selector,
                newssource.get('conditions'), newssource.get('formatter'))
                links = linkdetector.detect(content, keyword)

        if items and newssource.get('conditions', {}).get('detectdetail'):

        if newssource.get('header'):
            httpheader = jsonutil.getReadableString(newssource['header'])

        if newssource.get('formatter'):
            formatter = jsonutil.getReadableString(newssource['formatter'])

        if not pageinfo and fetchurl:
            pageinfo = pmapi.getPage(fetchurl)

        templateValues = {
            'newssource': newssource,
            'httpheader': httpheader,
            'formatter': formatter,
            'content': content,
            'oldContent': oldContent,
            'encodingUsed': encodingUsed,
            'urlUsed': urlUsed,
            'keyword': keyword,
            'links': links,
            'items': items,
            'jsonstr': jsonstr,
            'pageinfo': pageinfo,
            'strpageinfo': json.dumps(pageinfo),
