Example #1
def fetch(url):
    result = {}
    fetcher = ContentFetcher(url, tried=2)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return result
    try:
        htmlelement = lxml.html.fromstring(content)
    except Exception:
        logging.error('Failed to load html from content.')
        return result
    # Build the PyQuery wrapper once and reuse it for the three lookups below.
    doc = pyquery.PyQuery(htmlelement)
    match = doc('head meta[name=keywords]')
    if match:
        keywords = match[0].get('content')
        if keywords:
            result['keywords'] = lxmlutil.getPureString(keywords)
    match = doc('head meta[name=description]')
    if match:
        description = match[0].get('content')
        if description:
            result['description'] = lxmlutil.getPureString(description)
    match = doc('head title')
    if match:
        title = match[0].text_content()
        if title:
            result['title'] = lxmlutil.getPureString(title)
    return result
Example #2
def _fetchContent(data, triedcount, feedback):
    fetchurl = data['fetchurl']
    header = data.get('header')
    encoding = data.get('encoding')
    fetcher = ContentFetcher(fetchurl, header=header,
                             encoding=encoding, tried=triedcount)
    fetchResult = fetcher.fetch(feedback)
    content = fetchResult.get('content')
    urlUsed = fetchResult.get('url')
    return urlUsed, content
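
ContentFetcher itself never appears in these excerpts. As a reading aid, the following is a minimal, hypothetical sketch of the interface the call sites above assume: header, encoding, and tried keyword arguments, an optional feedback argument to fetch(), and a result dict read via .get('content'), .get('url'), and .get('encoding'). It is written in Python 3, while the excerpts are Python 2 era, and it is not the project's real implementation.

# Hypothetical sketch only; the project's real ContentFetcher is not shown.
import urllib.error
import urllib.request

class ContentFetcher:
    def __init__(self, url, header=None, encoding=None, tried=1):
        self.url = url
        self.header = header or {}  # optional extra HTTP headers
        self.encoding = encoding    # optional charset override
        self.tried = tried          # retry budget; the examples pass tried=2

    def fetch(self, feedback=None):
        # Callers read the result with .get(), so a failed fetch can
        # simply return an empty dict.
        for _ in range(self.tried + 1):
            try:
                request = urllib.request.Request(self.url, headers=self.header)
                with urllib.request.urlopen(request, timeout=30) as response:
                    raw = response.read()
                    encoding = (self.encoding
                                or response.headers.get_content_charset()
                                or 'utf-8')
                    return {
                        'content': raw.decode(encoding, errors='replace'),
                        'url': response.geturl(),  # final URL after redirects
                        'encoding': encoding,
                    }
            except urllib.error.URLError:
                continue
        return {}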
Example #3
    def post(self):
        data = json.loads(self.request.body)

        items = data['items']
        origin = data['origin']
        header = data.get('header')
        for item in items:
            url = item.get('url')
            if not url:
                continue
            fetcher = ContentFetcher(url, header=header, tried=2)
            fetchResult = fetcher.fetch()
            usedUrl = fetchResult.get('url')
            content = fetchResult.get('content')
            if not content:
                logging.error('Failed to get content from %s.' % (url, ))
                continue
            item['url'] = usedUrl
            try:
                editorFormat = globalconfig.getEditorFormat()
                page = pageanalyst.analyse(usedUrl, content,
                            editorFormat=editorFormat, monitorTitle=item.get('title'))
                if not item.get('title') and page.get('title'):
                    item['title'] = page['title']
                if not item.get('published') and page.get('published') \
                        and not page['published'].endswith('0000'):
                    # a published time ending in '0000' has no hour/minute, so it is not precise enough
                    item['published'] = page['published']
                    if origin.get('timezone'):
                        item['published'] = dateutil.adjustDate14(item['published'], origin['timezone'])
                if not item.get('content') and page.get('content'):
                    item['content'] = page['content']
                if not item.get('img') and page.get('images'):
                    item['img'] = page['images'][0]
            except Exception:
                logging.exception('Error happened when analysing %s.' % (usedUrl, ))

        responseData = {
                'origin': data['origin'],
                'items': items,
        }

        self.response.headers['Content-Type'] = 'text/plain'
        callbackurl = data['callbackurl']
        success = networkutil.postData(callbackurl, responseData,
                    trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)

        if success:
            message = 'Pushed items back for %s to %s.' % (data['origin'], callbackurl)
        else:
            message = 'Failed to push items back for %s to %s.' % (data['origin'], callbackurl)
        logging.info(message)
        self.response.out.write(message)
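
networkutil.postData is not shown in the excerpt above. A minimal sketch of a retrying JSON POST helper with the same trycount and timeout parameters might look like this (hypothetical; the real helper may differ in payload format and error handling):

# Hypothetical sketch only; the real networkutil.postData is not shown.
import json
import urllib.error
import urllib.request

def postData(url, data, trycount=1, timeout=10):
    """POST data as JSON to url, retrying up to trycount times.

    Returns True once a request succeeds, False if every attempt fails.
    """
    body = json.dumps(data).encode('utf-8')
    request = urllib.request.Request(
        url, data=body, headers={'Content-Type': 'application/json'})
    for _ in range(trycount):
        try:
            # urlopen raises on HTTP error status codes, so returning at
            # all means the callback endpoint accepted the payload.
            with urllib.request.urlopen(request, timeout=timeout):
                return True
        except urllib.error.URLError:
            pass
    return False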
Example #4
def fetch(url):
    parseresult = urlparse.urlparse(url)
    queryurl = 'http://data.alexa.com/data?cli=10&url=%s' % (parseresult.netloc, )
    result = {}
    fetcher = ContentFetcher(queryurl, tried=2)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return result
    tree = lxmlutil.parseFromUnicode(content)
    alexa = getAlexaInfo(tree)
    if alexa:
        result['alexa'] = alexa
    dmoz = getDmozInfo(tree)
    if dmoz:
        result['dmoz'] = dmoz
    return result
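
getAlexaInfo and getDmozInfo are not shown. Assuming the classic data.alexa.com XML layout (a POPULARITY element whose TEXT attribute carries the traffic rank, and a DMOZ/SITE element with TITLE and DESC attributes), hypothetical sketches could look like this:

# Hypothetical sketches; the element and attribute names are assumptions
# based on the classic data.alexa.com response format.
def getAlexaInfo(tree):
    popularity = tree.find('.//POPULARITY')
    if popularity is None:
        return None
    return {'rank': popularity.get('TEXT')}

def getDmozInfo(tree):
    site = tree.find('.//DMOZ/SITE')
    if site is None:
        return None
    return {'title': site.get('TITLE'), 'description': site.get('DESC')}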
Example #5
def _detectDetailUrl(url, title):
    tried = 2
    fetcher = ContentFetcher(url, tried=tried)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return None
    docelement = lxml.html.fromstring(content)
    aElements = pyquery.PyQuery(docelement)('a')
    for aElement in aElements:
        if lxmlutil.getCleanText(aElement) != title:
            continue
        detailUrl = aElement.get('href')
        if detailUrl:
            detailUrl = urlparse.urljoin(url, detailUrl)
            return detailUrl
    return None
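
For context, a call such as the following (all values made up) scans the anchors of a listing page for one whose cleaned text exactly equals the given title, and returns that anchor's href resolved to an absolute URL:

# Hypothetical usage of _detectDetailUrl with made-up values.
detailUrl = _detectDetailUrl('http://example.com/news/', 'Some article title')
if detailUrl:
    print(detailUrl)  # absolute URL taken from the matching anchor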
Example #6
 def get(self):
     url = self.request.get('url')
     page = None
     contentGot = False  # avoids a NameError at the `if not contentGot` check when url is empty
     if url:
         try:
             url = base64.b64decode(url)
             url2 = ''
             length = len(url)
             for i in range(0, length, 2):
                 if i + 1 < length:
                     url2 += url[i+1] + url[i]
             if length % 2 != 0:
                 url2 += url[-1]
             url = url2
         except TypeError:
             pass
         key = stringutil.calculateHash([url])
         page = memcache.get(key)
         contentGot = bool(page)
         if not page:
             tried = 2 # the max try count is 3
             fetcher = ContentFetcher(url, tried=tried)
             fetchResult = fetcher.fetch()
             content = fetchResult.get('content')
             if content:
                 editorFormat = globalconfig.getEditorFormat()
                 page = pageanalyst.analyse(url, content, editorFormat=editorFormat)
                 if page:
                     page['url'] = url
                 if page and (page.get('content') or page.get('images')):
                     memcache.set(key, page)
                     contentGot = True
     if not contentGot:
         page = {'url': url}
         self.redirect(url, permanent=True)
         return
     if 'images' in page:
         for image in page['images']:
             image['url'] = '/image/?url=' + urllib.quote(image['url'].encode('utf-8'))
     templateValues = {
         'page': page,
     }
     self.render(templateValues, 'home.html')
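
The decode loop above reverses a simple obfuscation scheme: swap every pair of adjacent characters, then base64-encode. Because the pairwise swap is its own inverse, the same loop works on both sides. A hypothetical encoder matching it (written for Python 3; the handler above is Python 2 era):

# Hypothetical encoder matching the decode loop in the handler above.
import base64

def obfuscateUrl(url):
    swapped = ''
    length = len(url)
    for i in range(0, length, 2):
        if i + 1 < length:
            swapped += url[i + 1] + url[i]  # swap each adjacent pair
    if length % 2 != 0:
        swapped += url[-1]                  # odd trailing char stays put
    return base64.b64encode(swapped.encode('ascii')).decode('ascii')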
Example #7
 def post(self):
     url = self.request.get('url')
     title = self.request.get('title')
     fetchResult = {}
     content = None
     page = None
     fortest = bool(self.request.get('fortest'))
     httpheader = self.request.get('httpheader')
     header = None
     if httpheader:
         header = json.loads(httpheader)
     if url:
         tried = 2 # the max try count is 3
         fetcher = ContentFetcher(url, header=header, tried=tried)
         fetchResult = fetcher.fetch()
         content = fetchResult.get('content')
     elementResult = {}
     if content:
         editorFormat = globalconfig.getEditorFormat()
         page = pageanalyst.analyse(url, content, editorFormat=editorFormat,
                                    monitorTitle=title, fortest=fortest,
                                    elementResult=elementResult)
     if header:
         httpheader = jsonutil.getReadableString(header)
     templateValues = {
         'url': url,
         'title': title,
         'fortest': fortest,
         'httpheader': httpheader,
         'encoding': fetchResult.get('encoding'),
         'encodingSrc': fetchResult.get('encoding.src'),
         'oldContent': fetchResult.get('content.old'),
         'content': fetchResult.get('content'),
         'pagestr': jsonutil.getReadableString(page),
         'page': page,
         'elementResult': elementResult,
     }
     self.render(templateValues, 'test.html')
Example #8
        """
        Fetch the url provided and retrieve links, subsequently fetching
        the pages at those links until reaching limit (or running out of links).
        :param start_url: url to start from
        :param limit: number of urls to return in list
        :return: list of urls discovered
        """
        urls = [start_url]
        seen = {start_url: True}
        count = 1
        while len(urls) > 0 and count < limit:
            url = urls.pop()
            contents = self.content_fetcher.retrieve_page(url)
            new_urls = filter(lambda x: x not in seen, extract_urls(url, contents))
            for new_url in new_urls:
                if count == limit:
                    break
                urls.append(new_url)
                seen[new_url] = True
                count += 1
        return list(seen.keys())


if __name__ == "__main__":
    parser = setup_argument_parser()
    args = parser.parse_args()
    web_crawler = WebCrawler(ContentFetcher(args.agents))
    found_urls = web_crawler.discover(args.url, limit=args.limit)
    for url in found_urls:
        print(url)
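
extract_urls is not shown in this excerpt. A minimal sketch using only the standard library, assuming it returns absolute URLs for every anchor href on the page:

# Hypothetical sketch of extract_urls; the real helper is not shown.
from html.parser import HTMLParser
from urllib.parse import urljoin

class _LinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hrefs.append(value)

def extract_urls(base_url, contents):
    parser = _LinkParser()
    parser.feed(contents)
    # Resolve relative links against the page they were found on.
    return [urljoin(base_url, href) for href in parser.hrefs]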
Example #9
    def post(self):
        action = self.request.get('action')
        keyword = ''
        pageinfo = None
        if action == 'JSON':
            jsonstr = self.request.get('jsonstr')
            if jsonstr:
                newssource = json.loads(jsonstr)
            else:
                newssource = _DEFAULT_NEWSSOURCE
            encodingUsed = ''
            urlUsed = ''
            content = ''
            oldContent = ''  # initialized so templateValues below never hits an undefined name
            httpheader = ''
            formatter = ''
        else:
            keyword = self.request.get('keyword').strip()
            pageinfo = self.request.get('pageinfo').strip()
            if pageinfo:
                pageinfo = json.loads(pageinfo)
            newssource = {}
            newssource['active'] = bool(self.request.get('active'))
            newssource['slug'] = self.request.get('slug')
            newssource['name'] = self.request.get('name')
            newssource['order'] = self.request.get('order')
            newssource['charts'] = bool(self.request.get('charts'))
            newssource['fetchurl'] = self.request.get('fetchurl')
            if newssource['fetchurl'] and not newssource['fetchurl'].startswith('http'):
                newssource['fetchurl'] = 'http://' + newssource['fetchurl']
            if not newssource['slug'] and newssource['fetchurl']:
                newssource['slug'] = urlparse.urlparse(newssource['fetchurl']).netloc
            httpheader = self.request.get('httpheader')
            if httpheader:
                newssource['header'] = json.loads(httpheader)
            newssource['encoding'] = self.request.get('encoding')
            newssource['tags'] = self.request.get('tags')

            # The following fields are only used to display the parsed result.
            encodingUsed = self.request.get('encodingUsed')
            urlUsed = self.request.get('urlUsed')
            oldContent = self.request.get('oldContent')

            newssource['selector'] = self.request.get('selector').strip()
            conditions = {}
            conditions['returnall'] = bool(self.request.get('returnall'))
            conditions['emptytitle'] = bool(self.request.get('emptytitle'))
            conditions['detectdetail'] = bool(self.request.get('detectdetail'))
            conditions['scripttext'] = bool(self.request.get('scripttext'))
            excludeselector = self.request.get('excludeselector').strip()
            if excludeselector:
                if 'exclude' not in conditions:
                    conditions['exclude'] = {}
                conditions['exclude']['selector'] = excludeselector

            includeselector = self.request.get('includeselector').strip()
            if includeselector:
                if 'include' not in conditions:
                    conditions['include'] = {}
                conditions['include']['selector'] = includeselector
            urlselector = self.request.get('urlselector').strip()
            titleselector = self.request.get('titleselector').strip()
            imageselector = self.request.get('imageselector').strip()
            contentselector = self.request.get('contentselector').strip()
            linkselector = self.request.get('linkselector').strip()
            imagelinkselector = self.request.get('imagelinkselector').strip()
            if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
                conditions['criterion'] = {}
                if urlselector:
                    conditions['criterion']['url'] = urlselector
                if titleselector:
                    conditions['criterion']['title'] = titleselector
                if contentselector:
                    conditions['criterion']['content'] = contentselector
                if imageselector:
                    conditions['criterion']['image'] = imageselector
                if linkselector:
                    conditions['criterion']['link'] = linkselector
                if imagelinkselector:
                    conditions['criterion']['imagelink'] = imagelinkselector
            newssource['conditions'] = conditions

            formatter = self.request.get('formatter')
            if formatter:
                newssource['formatter'] = json.loads(formatter)

            newssource['description'] = self.request.get('description').strip()

            content = self.request.get('content')
            jsonstr = jsonutil.getReadableString(newssource)

        if 'active' not in newssource:
            newssource['active'] = True

        items = []
        links = []
        selector = newssource.get('selector')
        fetchurl = newssource.get('fetchurl')

        tried = 2 # the max try count is 3
        if not content and fetchurl:
            fetcher = ContentFetcher(fetchurl,
                                     header=newssource.get('header'),
                                     encoding=newssource.get('encoding'),
                                     tried=tried)
            fetchResult = fetcher.fetch()
            content = fetchResult.get('content')
            oldContent = fetchResult.get('content.old')
            urlUsed = fetchResult.get('url')
            encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                      fetchResult.get('encoding.src'))
        if content:
            content = lxmlutil.removeEncodingDeclaration(content)
            if selector:
                parser = HtmlContentParser()
                items = parser.parse(urlUsed, content, selector,
                                     newssource.get('conditions'),
                                     newssource.get('formatter'))
            else:
                links = linkdetector.detect(content, keyword)

        if items and newssource.get('conditions', {}).get('detectdetail'):
            detaildetector.populateDetailUrls(items)

        if newssource.get('header'):
            httpheader = jsonutil.getReadableString(newssource['header'])

        if newssource.get('formatter'):
            formatter = jsonutil.getReadableString(newssource['formatter'])

        if not pageinfo and fetchurl:
            pageinfo = pmapi.getPage(fetchurl)

        templateValues = {
            'newssource': newssource,
            'httpheader': httpheader,
            'formatter': formatter,
            'content': content,
            'oldContent': oldContent,
            'encodingUsed': encodingUsed,
            'urlUsed': urlUsed,
            'keyword': keyword,
            'links': links,
            'items': items,
            'jsonstr': jsonstr,
            'pageinfo': pageinfo,
            'strpageinfo': json.dumps(pageinfo),
        }
        self._render(templateValues)
Example #10
class TestContentFetcher(unittest.TestCase):
    user_agents = ["Mozilla", "Python", "Something Else"]
    url = "https://crawler-test.com/"

    def setUp(self) -> None:
        self.content_fetcher = ContentFetcher(self.user_agents)

    def test_get_next_user_agent_cycles_through_agents(self):
        self.assertEqual(self.content_fetcher.get_next_user_agent(), "Mozilla")
        self.assertEqual(self.content_fetcher.get_next_user_agent(), "Python")
        self.assertEqual(self.content_fetcher.get_next_user_agent(),
                         "Something Else")
        self.assertEqual(self.content_fetcher.get_next_user_agent(), "Mozilla")

    def test_get_next_user_agent_cycles_returns_none_when_none_given(self):
        content_fetcher_no_agents = ContentFetcher([])
        self.assertIsNone(content_fetcher_no_agents.get_next_user_agent())

    def test_construct_request_adds_correct_headers(self):
        request = self.content_fetcher.construct_request(self.url)
        self.assertEqual(request.get_header("User-agent"), "Mozilla")
        self.assertEqual(request.get_header("Accept-encoding"),
                         "gzip, deflate")
        self.assertEqual(request.get_full_url(), self.url)
        self.assertEqual(request.get_header("Accept"), "text/html")

    def test_decompress_content_handles_gzip(self):
        test_bytes = "Compress me".encode('utf-8')
        compressed_data = gzip.compress(test_bytes)
        self.assertEqual(
            self.content_fetcher.decompress_content(compressed_data, "gzip"),
            test_bytes)

    def test_decompress_content_handles_deflate(self):
        test_bytes = "Compress me".encode('utf-8')
        compressed_data = zlib.compress(test_bytes)
        self.assertEqual(
            self.content_fetcher.decompress_content(compressed_data,
                                                    "deflate"), test_bytes)

    def test_decompress_content_recovers_when_unknown_format(self):
        test_bytes = "Compress me".encode('utf-8')
        compressed_data = zlib.compress(test_bytes)
        self.assertEqual(
            self.content_fetcher.decompress_content(compressed_data,
                                                    "unknown"),
            ''.encode('utf-8'))

    def test_handle_response_can_handle_gzip_content(self):
        test_string = "Compress me"
        compressed_data = gzip.compress(test_string.encode('utf-8'))
        headers = [("Content-Encoding", "gzip")]
        self.assertEqual(
            self.content_fetcher.handle_response(headers, compressed_data),
            test_string)

    def test_handle_response_can_handle_deflate_content(self):
        test_string = "Compress me"
        compressed_data = zlib.compress(test_string.encode('utf-8'))
        headers = [("Content-Encoding", "deflate")]
        self.assertEqual(
            self.content_fetcher.handle_response(headers, compressed_data),
            test_string)
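
Read together, these tests pin down the fetcher's contract: user agents cycle in order (None when the list is empty), construct_request sets User-Agent, Accept, and Accept-Encoding headers, decompress_content handles gzip and deflate and returns empty bytes for unknown formats, and handle_response picks the decoder from the Content-Encoding header. A minimal hypothetical implementation that would satisfy them (the real class is not shown):

# Hypothetical implementation satisfying the tests above.
import gzip
import itertools
import urllib.request
import zlib

class ContentFetcher:
    def __init__(self, user_agents):
        # cycle() on an empty list would raise on next(), so guard it.
        self._agents = itertools.cycle(user_agents) if user_agents else None

    def get_next_user_agent(self):
        return next(self._agents) if self._agents else None

    def construct_request(self, url):
        return urllib.request.Request(url, headers={
            'User-Agent': self.get_next_user_agent(),
            'Accept': 'text/html',
            'Accept-Encoding': 'gzip, deflate',
        })

    def decompress_content(self, data, encoding):
        if encoding == 'gzip':
            return gzip.decompress(data)
        if encoding == 'deflate':
            return zlib.decompress(data)
        return b''  # unknown format: recover by returning empty bytes

    def handle_response(self, headers, data):
        # headers arrives as a list of (name, value) pairs in the tests.
        encoding = dict(headers).get('Content-Encoding')
        if encoding:
            data = self.decompress_content(data, encoding)
        return data.decode('utf-8')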