def _fetchContent(data, triedcount, feedback):
    fetchurl = data['fetchurl']
    header = data.get('header')
    encoding = data.get('encoding')
    fetcher = ContentFetcher(fetchurl, header=header, encoding=encoding,
                             tried=triedcount)
    fetchResult = fetcher.fetch(feedback)
    content = fetchResult.get('content')
    urlUsed = fetchResult.get('url')
    return urlUsed, content
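# Illustrative stand-in, not part of this module: the keys read above
# ('content', 'url') and in the handler below ('content.old', 'encoding',
# 'encoding.src') suggest ContentFetcher.fetch() returns a plain dict. A
# stub with that assumed shape can back _fetchContent in offline tests;
# the real ContentFetcher does the actual HTTP work and may differ.
class StubContentFetcher(object):
    def __init__(self, fetchurl, header=None, encoding=None, tried=0):
        self.fetchurl = fetchurl

    def fetch(self, feedback=None):
        # Mirror the assumed result shape of the real fetcher.
        return {
            'content': '<html><body>stub</body></html>',
            'content.old': None,
            'url': self.fetchurl,
            'encoding': 'utf-8',
            'encoding.src': 'detected',
        }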
def _detectDetailUrl(url, title):
    tried = 2
    fetcher = ContentFetcher(url, tried=tried)
    fetchResult = fetcher.fetch()
    content = fetchResult.get('content')
    if not content:
        return None
    docelement = lxml.html.fromstring(content)
    aElements = pyquery.PyQuery(docelement)('a')
    for aElement in aElements:
        if lxmlutil.getCleanText(aElement) != title:
            continue
        detailUrl = aElement.get('href')
        if detailUrl:
            detailUrl = urlparse.urljoin(url, detailUrl)
        return detailUrl
    return None
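# _detectDetailUrl matches anchors by their visible text, so it leans on
# lxmlutil.getCleanText, which is defined elsewhere in the project. A
# minimal sketch of the assumed behavior on an lxml element (the real
# helper may normalize more aggressively):
def _getCleanTextSketch(element):
    # Full text content with runs of whitespace collapsed to single spaces.
    return ' '.join(element.text_content().split())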
""" Fetch the url provided and retrieve links, subsequently fetching the pages at those links until reaching limit (or running out of links). :param start_url: url to start from :param limit: number of urls to return in list :return: list of urls discovered """ urls = [start_url] seen = {start_url: True} count = 1 while len(urls) > 0 and count < limit: url = urls.pop() contents = self.content_fetcher.retrieve_page(url) new_urls = filter(lambda x: x not in seen, extract_urls(url, contents)) for new_url in new_urls: if count == limit: break urls.append(new_url) seen[new_url] = True count += 1 return list(seen.keys()) if __name__ == "__main__": parser = setup_argument_parser() args = parser.parse_args() web_crawler = WebCrawler(ContentFetcher(args.agents)) found_urls = web_crawler.discover(args.url, limit=args.limit) for url in found_urls: print(url)
def test_get_next_user_agent_cycles_returns_none_when_none_given(self):
    content_fetcher_no_agents = ContentFetcher([])
    self.assertIsNone(content_fetcher_no_agents.get_next_user_agent())
def setUp(self) -> None:
    self.content_fetcher = ContentFetcher(self.user_agents)
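# A companion test sketching the rotation implied by the method name.
# It assumes get_next_user_agent walks self.user_agents in order and wraps
# around after a full pass; the fetcher's actual policy may differ.
def test_get_next_user_agent_cycles_through_given_agents(self):
    first_pass = [self.content_fetcher.get_next_user_agent()
                  for _ in self.user_agents]
    self.assertEqual(first_pass, list(self.user_agents))
    # After one full pass the fetcher should wrap back to the start.
    self.assertEqual(self.content_fetcher.get_next_user_agent(),
                     self.user_agents[0])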
def post(self):
    action = self.request.get('action')
    keyword = ''
    pageinfo = None
    if action == 'JSON':
        # Raw JSON mode: the whole newssource arrives as one JSON document.
        jsonstr = self.request.get('jsonstr')
        if jsonstr:
            newssource = json.loads(jsonstr)
        else:
            newssource = _DEFAULT_NEWSSOURCE
        encodingUsed = ''
        urlUsed = ''
        content = ''
        oldContent = ''  # not supplied in JSON mode; avoids a NameError when rendering
        httpheader = ''
        formatter = ''
    else:
        # Form mode: assemble the newssource dict field by field.
        keyword = self.request.get('keyword').strip()
        pageinfo = self.request.get('pageinfo').strip()
        if pageinfo:
            pageinfo = json.loads(pageinfo)
        newssource = {}
        newssource['active'] = bool(self.request.get('active'))
        newssource['slug'] = self.request.get('slug')
        newssource['name'] = self.request.get('name')
        newssource['order'] = self.request.get('order')
        newssource['charts'] = bool(self.request.get('charts'))
        newssource['fetchurl'] = self.request.get('fetchurl')
        if newssource['fetchurl'] and not newssource[
                'fetchurl'].startswith('http'):
            newssource['fetchurl'] = 'http://' + newssource['fetchurl']
        if not newssource['slug'] and newssource['fetchurl']:
            newssource['slug'] = urlparse.urlparse(
                newssource['fetchurl']).netloc
        httpheader = self.request.get('httpheader')
        if httpheader:
            newssource['header'] = json.loads(httpheader)
        newssource['encoding'] = self.request.get('encoding')
        newssource['tags'] = self.request.get('tags')
        # The following fields are only for showing the parsed result.
        encodingUsed = self.request.get('encodingUsed')
        urlUsed = self.request.get('urlUsed')
        oldContent = self.request.get('oldContent')
        newssource['selector'] = self.request.get('selector').strip()
        conditions = {}
        conditions['returnall'] = bool(self.request.get('returnall'))
        conditions['emptytitle'] = bool(self.request.get('emptytitle'))
        conditions['detectdetail'] = bool(self.request.get('detectdetail'))
        conditions['scripttext'] = bool(self.request.get('scripttext'))
        excludeselector = self.request.get('excludeselector').strip()
        if excludeselector:
            if 'exclude' not in conditions:
                conditions['exclude'] = {}
            conditions['exclude']['selector'] = excludeselector
        includeselector = self.request.get('includeselector').strip()
        if includeselector:
            if 'include' not in conditions:
                conditions['include'] = {}
            conditions['include']['selector'] = includeselector
        urlselector = self.request.get('urlselector').strip()
        titleselector = self.request.get('titleselector').strip()
        imageselector = self.request.get('imageselector').strip()
        contentselector = self.request.get('contentselector').strip()
        linkselector = self.request.get('linkselector').strip()
        imagelinkselector = self.request.get('imagelinkselector').strip()
        if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
            conditions['criterion'] = {}
            if urlselector:
                conditions['criterion']['url'] = urlselector
            if titleselector:
                conditions['criterion']['title'] = titleselector
            if contentselector:
                conditions['criterion']['content'] = contentselector
            if imageselector:
                conditions['criterion']['image'] = imageselector
            if linkselector:
                conditions['criterion']['link'] = linkselector
            if imagelinkselector:
                conditions['criterion']['imagelink'] = imagelinkselector
        newssource['conditions'] = conditions
        formatter = self.request.get('formatter')
        if formatter:
            newssource['formatter'] = json.loads(formatter)
        newssource['description'] = self.request.get('description').strip()
        content = self.request.get('content')
    jsonstr = jsonutil.getReadableString(newssource)
    if 'active' not in newssource:
        newssource['active'] = True
    items = []
    links = []
    selector = newssource.get('selector')
    fetchurl = newssource.get('fetchurl')
    tried = 2  # the max try count is 3
    if not content and fetchurl:
        # No pasted content, so fetch the page now.
        fetcher = ContentFetcher(fetchurl,
                                 header=newssource.get('header'),
                                 encoding=newssource.get('encoding'),
                                 tried=tried)
        fetchResult = fetcher.fetch()
        content = fetchResult.get('content')
        oldContent = fetchResult.get('content.old')
        urlUsed = fetchResult.get('url')
        encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                  fetchResult.get('encoding.src'))
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        if selector:
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector,
                                 newssource.get('conditions'),
                                 newssource.get('formatter'))
        else:
            # Without a selector, just surface candidate links for the keyword.
            links = linkdetector.detect(content, keyword)
    if items and newssource.get('conditions', {}).get('detectdetail'):
        detaildetector.populateDetailUrls(items)
    if newssource.get('header'):
        httpheader = jsonutil.getReadableString(newssource['header'])
    if newssource.get('formatter'):
        formatter = jsonutil.getReadableString(newssource['formatter'])
    if not pageinfo and fetchurl:
        pageinfo = pmapi.getPage(fetchurl)
    templateValues = {
        'newssource': newssource,
        'httpheader': httpheader,
        'formatter': formatter,
        'content': content,
        'oldContent': oldContent,
        'encodingUsed': encodingUsed,
        'urlUsed': urlUsed,
        'keyword': keyword,
        'links': links,
        'items': items,
        'jsonstr': jsonstr,
        'pageinfo': pageinfo,
        'strpageinfo': json.dumps(pageinfo),
    }
    self._render(templateValues)
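# For reference, a hedged example of the 'jsonstr' payload the JSON action
# accepts, shaped like the newssource dict assembled above. Every value is
# illustrative only; example.com is a placeholder.
_EXAMPLE_NEWSSOURCE = {
    'active': True,
    'slug': 'example.com',
    'name': 'Example News',
    'order': '1',
    'charts': False,
    'fetchurl': 'http://example.com/news',
    'header': {'User-Agent': 'Mozilla/5.0'},
    'encoding': 'utf-8',
    'tags': 'demo',
    'selector': 'div.story',
    'conditions': {
        'returnall': False,
        'emptytitle': False,
        'detectdetail': False,
        'scripttext': False,
        'exclude': {'selector': 'div.ad'},
        'criterion': {'url': 'h2 a', 'title': 'h2 a', 'content': 'p.summary'},
    },
    'formatter': {},
    'description': 'Illustrative configuration only.',
}
# json.dumps(_EXAMPLE_NEWSSOURCE) would be a valid value for 'jsonstr'.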