Example #1
	def _getWikiBooks(enWikiUrls):
		wikiBooks = []
		for enWikiUrl in enWikiUrls:
			print('Getting wiki book', enWikiUrl)

			# Every book starts from its English Wikipedia page.
			wikiBook = _WikiBook()
			wikiBook.Pages = []
			wikiBookPage = _WikiBookPage()
			wikiBookPage.Language = 'EN'
			wikiBookPage.WikiUrl = enWikiUrl
			wikiBook.Pages.append(wikiBookPage)

			html = HtmlDownloader.DownloadHtml('en.wikipedia.org', enWikiUrl)
			soup = BeautifulSoup(html, 'html.parser')

			# Follow the interlanguage links to the Russian and German
			# versions of the same article, when they exist.
			for language, prefix in (('RU', 'ru'), ('DE', 'de')):
				linkElement = soup.find(
					'li', attrs={'class': 'interlanguage-link interwiki-' + prefix})
				if linkElement is not None:
					wikiBookPage = _WikiBookPage()
					wikiBookPage.Language = language
					wikiBookPage.WikiUrl = linkElement.find('a')['href'].replace(
						'//' + prefix + '.wikipedia.org', '')
					wikiBook.Pages.append(wikiBookPage)

			wikiBooks.append(wikiBook)

		return wikiBooks
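
Every snippet in this section calls HtmlDownloader.DownloadHtml(host, path), a project-internal helper whose source is not shown. A minimal stand-in, assuming it simply fetches a page over HTTP and returns the decoded body (the real module may add retries, caching, or different encoding handling):

import http.client

class HtmlDownloader:
    @staticmethod
    def DownloadHtml(host, path):
        # Fetch the page over plain HTTP and decode it as UTF-8,
        # replacing any bytes that fail to decode.
        conn = http.client.HTTPConnection(host)
        try:
            conn.request('GET', path)
            response = conn.getresponse()
            return response.read().decode('utf-8', errors='replace')
        finally:
            conn.close()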
Example #2
def _getLowereadBookPages(bookId):
    pages = []
    pageNum = 1
    while True:
        html = HtmlDownloader.DownloadHtml(
            'loveread.ws',
            '/read_book.php?id=' + bookId + '&p=' + str(pageNum))
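        # The page embeds its own page number; when the server returns a
        # different number than the one requested, there are no more pages.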
        if str(pageNum) != _getLowereadPageNumber(html):
            break
        pages.append(html)
        pageNum += 1
    return pages
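
The helper _getLowereadPageNumber is defined elsewhere; the loop only needs it to return, as a string, the page number the server actually rendered. A hypothetical sketch (the pager markup assumed here is a guess, not loveread.ws's actual HTML):

import re

def _getLowereadPageNumber(html):
    # Hypothetical: pull the highlighted page number out of the pager.
    # The real markup on loveread.ws may differ.
    match = re.search(r'<b>(\d+)</b>', html)
    return match.group(1) if match else None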
Example #3
def _getGutenbergSpiegelPages(bookId):
    pages = []
    pageNum = 1

    while True:
        pageHtml = HtmlDownloader.DownloadHtml(
            'gutenberg.spiegel.de', '/buch/' + bookId + '/' + str(pageNum))
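        # An empty page marks the end of the book; the first page is kept
        # unconditionally.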
        if pageNum > 1 and _isGutenbergSpiegelPageEmpty(pageHtml):
            break
        pages.append(pageHtml)
        pageNum += 1

    return pages
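
Likewise, _isGutenbergSpiegelPageEmpty is not shown. A stand-in under the assumption that a page past the end of the book simply renders without text paragraphs:

from bs4 import BeautifulSoup

def _isGutenbergSpiegelPageEmpty(pageHtml):
    # Assumed heuristic: an out-of-range page contains no <p> elements.
    soup = BeautifulSoup(pageHtml, 'html.parser')
    return soup.find('p') is None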
Example #4
def _getReadCentraChapterPaths(path):
    chapterPaths = []

    html = HtmlDownloader.DownloadHtml('www.readcentral.com', path)

    soup = BeautifulSoup(html, 'html.parser')
    # Each chapter is linked from a td.bookindex cell of the book's index.
    tdElements = soup.find_all('td', attrs={'class': 'bookindex'})
    for tdElement in tdElements:
        linkElement = tdElement.find('a')
        if linkElement is not None:
            chapterPaths.append(linkElement['href'])

    chapterPaths.sort()
    return chapterPaths
Example #5
def _getPathToReadcentralBook(title):
    path = None

    titleFirstLetter = title[0]

    # readcentral.com groups its catalogue by the first letter of the title.
    html = HtmlDownloader.DownloadHtml(
        'www.readcentral.com', '/read-online-books/' + titleFirstLetter)

    soup = BeautifulSoup(html, 'html.parser')
    tdElements = soup.find_all('td', attrs={'class': 'bookindex'})
    for tdElement in tdElements:
        linkElement = tdElement.find('a')
        if linkElement is None or linkElement.string is None:
            continue
        linkText = linkElement.string.strip()
        if linkText.lower() == title.lower():
            path = linkElement['href']

    return path
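
Together with _getReadCentraChapterPaths above, the lookup chains naturally. A hypothetical usage (the title is made up):

path = _getPathToReadcentralBook('Pride and Prejudice')
if path is not None:
    chapterPaths = _getReadCentraChapterPaths(path)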
Example #6
def _getGutenbergSpiegelBookId(title):
    bookId = None

    title = title.lower()

    indexPageHtml = HtmlDownloader.DownloadHtml('gutenberg.spiegel.de',
                                                '/buch')
    indexPageSoup = BeautifulSoup(indexPageHtml, 'html.parser')

    # The index page lists every book as a link inside #spTeaserColumn.
    booksElements = indexPageSoup.find('div', attrs={
        'id': 'spTeaserColumn'
    }).find_all('a')
    for bookElement in booksElements:
        if bookElement.string is None:
            continue
        if bookElement.string.strip().lower() == title:
            # Links look like /buch/<id>/..., so the id is the second
            # path segment.
            bookId = bookElement['href'].split('/')[2]

    return bookId
Example #7
def _getLowereadBookId(title):
    bookId = None

    title = title.lower()
    titleFirstLetter = title[0]

    # loveread.ws indexes books by the first letter of the title; `let`
    # is that letter's 1-based position in the Cyrillic alphabet.
    html = HtmlDownloader.DownloadHtml(
        'loveread.ws',
        '/letter_nav.php?let=' + str((ord(titleFirstLetter) - ord('а')) + 1))

    soup = BeautifulSoup(html, 'html.parser')

    booksElement = soup.find('ul', attrs={'class': 'let_ul'})
    if booksElement is not None:
        for bookElement in booksElement.find_all('li'):
            if bookElement.a is None or bookElement.a.string is None:
                continue
            if bookElement.a.string.strip().lower() == title:
                # The link ends with ...id=<bookId>.
                bookId = bookElement.a['href'].split('id=')[1]

    return bookId
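
The `let` arithmetic maps the Cyrillic first letter to its 1-based position in the alphabet, counting from 'а'. For example:

# 'в' is the third letter of the Cyrillic alphabet:
letterIndex = (ord('в') - ord('а')) + 1   # 0x432 - 0x430 + 1 == 3

Note that 'ё' sits outside the contiguous а-я code-point range, so a title starting with 'ё' would map to the wrong index.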
Example #8
def _getReadcentralBookChapter(path):
    chapter = Library.BookChapter()
    chapter.Paragraphs = []

    html = HtmlDownloader.DownloadHtml('www.readcentral.com', path)

    soup = BeautifulSoup(html, 'html.parser')
    pageheadElem = soup.find('div', attrs={'id': 'pagehead'})
    chapter.Title = pageheadElem.div.string.strip()

    contentElement = soup.find(
        'div', attrs={'id': 'ctl00_contents_book_chapter_content_area'})
    for paragraphElement in contentElement.find_all('p'):
        # Join the tag's text nodes, dropping the markup between them.
        paragraphString = ''.join(paragraphElement.strings)
        if paragraphString:
            paragraph = Library.BookParagraph()
            paragraph.Sentences = getEnSentencesFromParagraphString(
                paragraphString)
            if len(paragraph.Sentences) != 0:
                chapter.Paragraphs.append(paragraph)

    return chapter
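
getEnSentencesFromParagraphString comes from elsewhere in the project; the loop above only needs a list of sentence strings back. A naive stand-in that splits on terminal punctuation (real segmentation needs more care around abbreviations and quotations):

import re

def getEnSentencesFromParagraphString(paragraphString):
    # Naive stand-in: split after '.', '!' or '?' followed by whitespace.
    # Abbreviations such as "Mr." will be split incorrectly.
    sentences = re.split(r'(?<=[.!?])\s+', paragraphString.strip())
    return [s for s in sentences if s]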
Example #9
	def _getBooks(wikiBooks):
		parallelBooks = []
		for i, wikiBook in enumerate(wikiBooks):
			print('Book', i + 1, 'of', len(wikiBooks))
			parallelBook = ParallelBook()
			parallelBook.Books = []
			for wikiBookPage in wikiBook.Pages:
				print(wikiBookPage.WikiUrl)

				book = Book()
				book.Language = wikiBookPage.Language

				html = HtmlDownloader.DownloadHtml(wikiBookPage.Language.lower() + '.wikipedia.org', wikiBookPage.WikiUrl)
				soup = BeautifulSoup(html, 'html.parser')
				headingElement = soup.find('h1', attrs={'id': 'firstHeading'})
				book.Title = headingElement.text
				# Drop disambiguation suffixes such as " (novel)".
				book.Title = re.sub(r'\(.+\)', '', book.Title).strip()

				parallelBook.Books.append(book)

			parallelBooks.append(parallelBook)

		return parallelBooks
Example #10
	def _getEnWikiUrls():
		wikiUrls = []

		html = HtmlDownloader.DownloadHtml('en.wikipedia.org', '/wiki/100_Classic_Book_Collection')
		soup = BeautifulSoup(html, 'html.parser')

		# The book titles sit inside <i><a> elements in the first data
		# cell of each row of the page's sortable wikitables.
		tableElements = soup.find_all('table', attrs={'class': 'wikitable sortable'})
		for tableElement in tableElements:
			rowElements = tableElement.find_all('tr')
			for rowElement in rowElements:
				dataElement = rowElement.find('td')
				if dataElement is None:
					continue
				italicElement = dataElement.find('i')
				if italicElement is None:
					continue
				bookElement = italicElement.find('a')
				if bookElement is not None:
					wikiUrls.append(bookElement['href'])

		# Deduplicate; the same book can appear in more than one table.
		return list(set(wikiUrls))
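
Examples #10, #1 and #9 form a pipeline: collect the English article URLs, pair each with its Russian and German counterparts, then build a titled Book per language. A hypothetical driver, assuming the three methods are exposed as statics on one class (the snippets do not show their enclosing class):

enWikiUrls = _getEnWikiUrls()
wikiBooks = _getWikiBooks(enWikiUrls)
parallelBooks = _getBooks(wikiBooks)
for parallelBook in parallelBooks:
    print([book.Title for book in parallelBook.Books])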