Example #1
           ]

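# Walk each alphabet index page, follow every composer link found after the
# page's <h1>, and record each violin PDF URL in the music table.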
for url in url_list:
	print url
	req = urllib2.Request(url, headers=hdr)

	try:
		page = urllib2.urlopen(req)
	except urllib2.HTTPError as e:
		# skip index pages that fail to load
		#print e.fp.read()
		continue

	# foundtext anchors on the page's <h1>; the composer links for this alphabet letter follow it
	foundtext = BeautifulSoup(page.read(), 'html5lib').find('h1')

	for link in foundtext.findAllNext('a'):
		href = link.get('href')
		# composer pages have "classical" somewhere in their URL
		if href and "classical" in href:
			composer_name = link.contents[0]

			# open the composer's page, which lists all of their songs
			req = urllib2.Request(href, headers=hdr)
			page = urllib2.urlopen(req)
			foundtext2 = BeautifulSoup(page.read(), 'html5lib').find('h1')

			for link2 in foundtext2.findAllNext('a'):
				href2 = link2.get('href')
				if href2 and "violin.pdf" in href2:
					pdf_url_value = href2

					values = {'composer':composer_name, 'pdf_url':pdf_url_value}
					#print url
					con.execute("insert into music VALUES (:composer, :pdf_url)", values)
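
Example #1 begins mid-file: the imports, the hdr request headers, the con database connection, and the contents of url_list are cut off above. A minimal sketch of the setup the loop appears to assume is shown below; the placeholder URL, the User-Agent string, the music.db file name, and the two-column music schema are inferred from how the variables are used, not taken from the original.

import urllib2
import sqlite3
from bs4 import BeautifulSoup

# Present a browser-like User-Agent so the site does not reject the requests.
hdr = {'User-Agent': 'Mozilla/5.0'}

# One index page per alphabet letter; the real URLs are truncated above,
# so this entry is only a hypothetical placeholder.
url_list = [
	'http://example.com/composers/a.html',
]

# Two-column table matching the INSERT statement used in the loop.
con = sqlite3.connect('music.db')
con.execute("create table if not exists music (composer text, pdf_url text)")

# ... the Example #1 loop runs here ...

# sqlite3 does not autocommit by default, so the inserts need an explicit commit.
con.commit()
con.close()
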
Example #2
import sys, urllib, urllib2, cookielib
from bs4 import BeautifulSoup 
import re
import sqlite3

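# Parse a locally saved composer page (bach.html): print the file name and URL
# of every violin PDF link, plus the label and URL of every other PDF link
# except those labelled "score, piano".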
with open("bach.html", "r") as myfile:
	data = myfile.read()

foundtext = BeautifulSoup(data, 'html5lib').find('h1')

for urls in foundtext.findAllNext('a'):
	href = urls.get('href')
	# only consider links that point at a PDF
	if href and ".pdf" in href:
		if "violin" in urls.contents[0]:
			# print the PDF file name (last path segment) and the full URL
			print href.split('/')[-1]
			print href
		elif "score, piano" not in urls.contents[0]:
			print urls.contents[0] + href
			

#for link2 in foundtext2.findAllNext('a'):
#	if "violin.pdf" in link2.get('href'):
#		pdf_url_value = link2.get('href')
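
Example #2 imports sqlite3 but never uses it, and the commented-out fragment above mirrors the storage loop from Example #1. A minimal sketch of how the matched violin PDF URLs could be written into the same music table, assuming Example #1's two-column schema and taking 'Bach' as the composer name only because the input file is bach.html:

import sqlite3
from bs4 import BeautifulSoup

con = sqlite3.connect('music.db')
con.execute("create table if not exists music (composer text, pdf_url text)")

with open("bach.html", "r") as myfile:
	soup = BeautifulSoup(myfile.read(), 'html5lib')

composer_name = 'Bach'  # assumed from the input file name only
for link in soup.find('h1').findAllNext('a'):
	href = link.get('href')
	if href and "violin.pdf" in href:
		con.execute("insert into music VALUES (:composer, :pdf_url)",
			{'composer': composer_name, 'pdf_url': href})

con.commit()
con.close()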