Example #1
def _fetchContent(data, triedcount, feedback):
    fetchurl = data['fetchurl']
    header = data.get('header')
    encoding = data.get('encoding')
    fetcher = ContentFetcher(fetchurl,
                             header=header,
                             encoding=encoding,
                             tried=triedcount)
    fetchResult = fetcher.fetch(feedback)
    content = fetchResult.get('content')
    urlUsed = fetchResult.get('url')
    return urlUsed, content
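For orientation, a hypothetical call might look like the sketch below. The shape of the `data` dict is inferred from the `.get()` calls above; the semantics of the `feedback` flag are not documented in the source, so treat that value as an assumption.

# Hypothetical usage sketch: 'header' and 'encoding' are optional per the
# .get() calls above, and feedback=False is an assumed placeholder value.
data = {
    'fetchurl': 'http://example.com/feed',
    'header': {'User-Agent': 'Mozilla/5.0'},
    'encoding': 'utf-8',
}
urlUsed, content = _fetchContent(data, triedcount=2, feedback=False)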
Example #2
import urlparse  # Python 2 stdlib; provides urljoin below

import lxml.html
import pyquery

# ContentFetcher and lxmlutil are project-local modules imported elsewhere.


def _detectDetailUrl(url, title):
    tried = 2
    fetcher = ContentFetcher(url, tried=tried)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return None
    docelement = lxml.html.fromstring(content)
    aElements = pyquery.PyQuery(docelement)('a')
    for aElement in aElements:
        if lxmlutil.getCleanText(aElement) != title:
            continue
        detailUrl = aElement.get('href')
        if detailUrl:
            detailUrl = urlparse.urljoin(url, detailUrl)
            return detailUrl
    return None
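Given a listing-page url and an article title, the helper above returns the absolute detail-page url whose anchor text matches the title, or None. A hedged usage sketch (the url and title are placeholders):

# Hypothetical call: resolve the detail link for a headline seen on a
# listing page. print() syntax works under both Python 2 and 3.
detailUrl = _detectDetailUrl('http://example.com/news', 'Some headline')
if detailUrl:
    print(detailUrl)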
Example #3
        """
        Fetch the url provided and retrieve links, subsequently fetching
        the pages at those links until reaching limit (or running out of links).
        :param start_url: url to start from
        :param limit: number of urls to return in list
        :return: list of urls discovered
        """
        urls = [start_url]
        seen = {start_url: True}
        count = 1
        while urls and count < limit:
            url = urls.pop()  # LIFO pop: depth-first traversal
            contents = self.content_fetcher.retrieve_page(url)
            # filter() is lazy in Python 3, so links marked seen in the loop
            # below are also skipped here, avoiding same-page duplicates.
            new_urls = filter(lambda x: x not in seen, extract_urls(url, contents))
            for new_url in new_urls:
                if count == limit:
                    break
                urls.append(new_url)
                seen[new_url] = True
                count += 1
        return list(seen.keys())


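# extract_urls(url, contents) is assumed above but not shown in the source.
# A minimal sketch of such a helper, using lxml as the other examples do;
# the source's actual implementation may differ.
import lxml.html


def extract_urls(url, contents):
    if not contents:
        return []
    doc = lxml.html.fromstring(contents)
    # Resolve relative hrefs against the page's own url.
    doc.make_links_absolute(url)
    return [a.get('href') for a in doc.iter('a') if a.get('href')]
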
if __name__ == "__main__":
    parser = setup_argument_parser()
    args = parser.parse_args()
    web_crawler = WebCrawler(ContentFetcher(args.agents))
    found_urls = web_crawler.discover(args.url, limit=args.limit)
    for url in found_urls:
        print(url)
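setup_argument_parser() is referenced above but not shown. Judging only from the attributes the __main__ block reads (args.url, args.limit, args.agents), a hypothetical reconstruction might look like this; the flag names and defaults are assumptions:

import argparse


def setup_argument_parser():
    # Hypothetical sketch based solely on the attributes used above; the
    # real parser may define these options differently.
    parser = argparse.ArgumentParser(description='Crawl a site for URLs.')
    parser.add_argument('url', help='url to start crawling from')
    parser.add_argument('--limit', type=int, default=10,
                        help='maximum number of urls to discover')
    parser.add_argument('--agents', nargs='*', default=[],
                        help='user agents for ContentFetcher to cycle through')
    return parser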
Example #4
    def test_get_next_user_agent_cycles_returns_none_when_none_given(self):
        content_fetcher_no_agents = ContentFetcher([])
        self.assertIsNone(content_fetcher_no_agents.get_next_user_agent())
Example #5
    def setUp(self) -> None:
        self.content_fetcher = ContentFetcher(self.user_agents)
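The test name implies that get_next_user_agent() cycles through the configured user agents and yields None when the list is empty. One way to satisfy that contract, sketched with itertools.cycle; this is an illustration, not the source's actual ContentFetcher:

import itertools


class CyclingAgentsSketch(object):
    # Hypothetical sketch of the contract implied by the tests above.
    def __init__(self, user_agents):
        self._cycle = itertools.cycle(user_agents) if user_agents else None

    def get_next_user_agent(self):
        # None when no agents were configured, else round-robin order.
        return next(self._cycle) if self._cycle else None


fetcher = CyclingAgentsSketch(['agent-a', 'agent-b'])
assert fetcher.get_next_user_agent() == 'agent-a'
assert fetcher.get_next_user_agent() == 'agent-b'
assert fetcher.get_next_user_agent() == 'agent-a'  # wraps around
assert CyclingAgentsSketch([]).get_next_user_agent() is None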
Example #6
    def post(self):
        action = self.request.get('action')
        keyword = ''
        pageinfo = None
        if action == 'JSON':
            jsonstr = self.request.get('jsonstr')
            if jsonstr:
                newssource = json.loads(jsonstr)
            else:
                newssource = _DEFAULT_NEWSSOURCE
            encodingUsed = ''
            urlUsed = ''
            content = ''
            # Initialize oldContent too: it is rendered in templateValues
            # below and would otherwise be undefined when fetchurl is empty.
            oldContent = ''
            httpheader = ''
            formatter = ''
        else:
            keyword = self.request.get('keyword').strip()
            pageinfo = self.request.get('pageinfo').strip()
            if pageinfo:
                pageinfo = json.loads(pageinfo)
            newssource = {}
            newssource['active'] = bool(self.request.get('active'))
            newssource['slug'] = self.request.get('slug')
            newssource['name'] = self.request.get('name')
            newssource['order'] = self.request.get('order')
            newssource['charts'] = bool(self.request.get('charts'))
            newssource['fetchurl'] = self.request.get('fetchurl')
            if newssource['fetchurl'] and not newssource[
                    'fetchurl'].startswith('http'):
                newssource['fetchurl'] = 'http://' + newssource['fetchurl']
            if not newssource['slug'] and newssource['fetchurl']:
                newssource['slug'] = urlparse.urlparse(
                    newssource['fetchurl']).netloc
            httpheader = self.request.get('httpheader')
            if httpheader:
                newssource['header'] = json.loads(httpheader)
            newssource['encoding'] = self.request.get('encoding')
            newssource['tags'] = self.request.get('tags')

            # The following fields are only for showing the parsed result.
            encodingUsed = self.request.get('encodingUsed')
            urlUsed = self.request.get('urlUsed')
            oldContent = self.request.get('oldContent')

            newssource['selector'] = self.request.get('selector').strip()
            conditions = {}
            conditions['returnall'] = bool(self.request.get('returnall'))
            conditions['emptytitle'] = bool(self.request.get('emptytitle'))
            conditions['detectdetail'] = bool(self.request.get('detectdetail'))
            conditions['scripttext'] = bool(self.request.get('scripttext'))
            excludeselector = self.request.get('excludeselector').strip()
            if excludeselector:
                if 'exclude' not in conditions:
                    conditions['exclude'] = {}
                conditions['exclude']['selector'] = excludeselector

            includeselector = self.request.get('includeselector').strip()
            if includeselector:
                if 'include' not in conditions:
                    conditions['include'] = {}
                conditions['include']['selector'] = includeselector
            urlselector = self.request.get('urlselector').strip()
            titleselector = self.request.get('titleselector').strip()
            imageselector = self.request.get('imageselector').strip()
            contentselector = self.request.get('contentselector').strip()
            linkselector = self.request.get('linkselector').strip()
            imagelinkselector = self.request.get('imagelinkselector').strip()
            if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
                conditions['criterion'] = {}
                if urlselector:
                    conditions['criterion']['url'] = urlselector
                if titleselector:
                    conditions['criterion']['title'] = titleselector
                if contentselector:
                    conditions['criterion']['content'] = contentselector
                if imageselector:
                    conditions['criterion']['image'] = imageselector
                if linkselector:
                    conditions['criterion']['link'] = linkselector
                if imagelinkselector:
                    conditions['criterion']['imagelink'] = imagelinkselector
            newssource['conditions'] = conditions

            formatter = self.request.get('formatter')
            if formatter:
                newssource['formatter'] = json.loads(formatter)

            newssource['description'] = self.request.get('description').strip()

            content = self.request.get('content')
            jsonstr = jsonutil.getReadableString(newssource)

        if 'active' not in newssource:
            newssource['active'] = True

        items = []
        links = []
        selector = newssource.get('selector')
        fetchurl = newssource.get('fetchurl')

        tried = 2  # the max try count is 3
        if not content and fetchurl:
            fetcher = ContentFetcher(fetchurl,
                                     header=newssource.get('header'),
                                     encoding=newssource.get('encoding'),
                                     tried=tried)
            fetchResult = fetcher.fetch()
            content = fetchResult.get('content')
            oldContent = fetchResult.get('content.old')
            urlUsed = fetchResult.get('url')
            encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                      fetchResult.get('encoding.src'))
        if content:
            content = lxmlutil.removeEncodingDeclaration(content)
            if selector:
                parser = HtmlContentParser()
                items = parser.parse(urlUsed, content, selector,
                                     newssource.get('conditions'),
                                     newssource.get('formatter'))
            else:
                links = linkdetector.detect(content, keyword)

        if items and newssource.get('conditions', {}).get('detectdetail'):
            detaildetector.populateDetailUrls(items)

        if newssource.get('header'):
            httpheader = jsonutil.getReadableString(newssource['header'])

        if newssource.get('formatter'):
            formatter = jsonutil.getReadableString(newssource['formatter'])

        if not pageinfo and fetchurl:
            pageinfo = pmapi.getPage(fetchurl)

        templateValues = {
            'newssource': newssource,
            'httpheader': httpheader,
            'formatter': formatter,
            'content': content,
            'oldContent': oldContent,
            'encodingUsed': encodingUsed,
            'urlUsed': urlUsed,
            'keyword': keyword,
            'links': links,
            'items': items,
            'jsonstr': jsonstr,
            'pageinfo': pageinfo,
            'strpageinfo': json.dumps(pageinfo),
        }
        self._render(templateValues)
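Across these examples the code relies on ContentFetcher.fetch() returning a dict read via .get() with the keys 'content', 'content.old', 'url', 'encoding', and 'encoding.src'. The sketch below only records that implied shape; the value descriptions are assumptions, not documented behavior:

# Shape of the dict fetcher.fetch() is expected to return, as implied by the
# .get(...) calls above. Values are illustrative placeholders.
fetchResult = {
    'content': '<html>...</html>',  # decoded page body
    'content.old': None,            # previously fetched body, if any (assumed)
    'url': 'http://example.com/',   # url actually used (e.g. after redirects)
    'encoding': 'utf-8',            # encoding used to decode the body
    'encoding.src': 'header',       # where that encoding came from (assumed)
}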