Example #1
File: branch2.py Project: Lufay/crawl_seed
import sys
import urllib
import urlparse

from bs4 import BeautifulSoup

import trunk  # project-local helper (open_page/download/header); sketched after this example

def download_torrent(url, logfile=sys.stderr):
	content = trunk.open_page(url)
	soup = BeautifulSoup(content)
	form = soup.find('form')
#	durl = posixpath.normpath(posixpath.join(posixpath.dirname(url), form['action']))
	durl = urlparse.urljoin(url, form['action'])
	datas = form('input', {'type':'hidden'})
	data = {}
	for item in datas:
		data[item['name']] = item['value'].encode('utf8')
	postdata = urllib.urlencode(data)
	print postdata, len(postdata)
	hd = trunk.header.copy()
	hd.update({
		'Content-Type': 'application/x-www-form-urlencoded',
		'Content-Length': str(len(postdata)),
		'Referer': str(url),
		})
	return trunk.download(durl, postdata, hd, logfile=logfile)
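Every snippet on this page leans on the project-local trunk module, which the listing never shows. Below is a minimal sketch of what it might look like, inferred only from the call sites above: open_page(url) returns the raw page body, header is a default header dict, and download(...) saves a file into the current directory, returning the saved filename or 'existed'. The real implementation in Lufay/crawl_seed may differ.

import os
import sys
import urllib2
import posixpath

# Assumed defaults; the real trunk.header is not shown on this page.
header = {'User-Agent': 'Mozilla/5.0'}

def open_page(url, data=None, hd=None):
	# GET (or POST, when data is given) and return the raw body.
	req = urllib2.Request(url, data, hd or header)
	return urllib2.urlopen(req).read()

def download(url, data=None, hd=None, logfile=sys.stderr):
	# Save the response under the URL's basename in the current directory.
	fn = posixpath.basename(url.split('?', 1)[0]) or 'index.html'
	if os.path.exists(fn):
		return 'existed'  # branch.py's crawl_subject stops when it sees this
	with open(fn, 'wb') as f:
		f.write(open_page(url, data, hd))
	logfile.write('saved %s\n' % fn)
	return fn  # download_torrent's caller logs this as the written file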
Example #2
File: branch2.py Project: Lufay/crawl_seed
import os
import sys

from bs4 import BeautifulSoup, element

import trunk  # domain, page_pattern, href_pattern are module-level names in branch2.py

def crawl_subject(short_url, only_torrent=False, logfile=sys.stdout):
	url = "%s%s" % (domain, short_url)
	content = trunk.open_page(url)
	soup = BeautifulSoup(content)
	sps = soup('span', class_='bold', text=page_pattern)
	if len(sps) != 1:
		logfile.write("Error: can't find the title!\n")
		return False, "Can't find the title"
	mc = sps[0].find_next_siblings('div')
	if len(mc) != 1:
		logfile.write("Error: There's more than one div!\n")
		return False, "More than one div"
	if only_torrent:
		logfile.write(mc[0].string.encode('gbk'))
		for dpage in soup('a', href=href_pattern):
			download_torrent(dpage['href'], logfile)
		return True, sps[0].string.encode('gbk')
	dir_seq = 1
	os.mkdir(str(dir_seq))
	os.chdir(str(dir_seq))
	for child in mc[0].descendants:
		if isinstance(child, element.NavigableString):
			logfile.write(child.encode('gbk'))
		elif isinstance(child, element.Tag):
			if child.name == 'br':
				logfile.write('\n')
			elif child.name == 'img':
				trunk.download(child['src'], logfile=logfile)
			elif child.name == 'a' and href_pattern.search(child['href']):
				fn = download_torrent(child['href'], logfile)
				logfile.write('Write the file %s\n' % fn)
				dir_seq += 1
				os.chdir('..')
				os.mkdir(str(dir_seq))
				os.chdir(str(dir_seq))
		else:
			logfile.write('child type error!!!')
	os.chdir('..')
	os.rmdir(str(dir_seq))
	return True, sps[0].string.encode('gbk')
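For orientation, a hedged usage sketch of the function above; the thread id in short_url is invented, and it assumes branch2.py's module-level domain and patterns are already set up:

ok, title = crawl_subject('viewthread.php?tid=12345&extra=page%3D1')  # hypothetical tid
if ok:
	print 'crawled subject:', title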
Example #3
File: branch.py Project: Lufay/crawl_seed
def crawl_page(page_id=1, clf=sys.stdout):
	content = trunk.open_page('%s%s%d' % (domain, pathquery, page_id))
	crawl_content(content, clf)  # assumed continuation: the snippet is cut off here, and branch2.py's crawl_page (Example #5) does the same
Example #4
File: branch.py Project: Lufay/crawl_seed
import os
import re
import sys
import time

from bs4 import BeautifulSoup

import trunk

domain = 'http://99btgc01.com/'
pathquery = 'forumdisplay.php?fid=21&page='
header = trunk.header
fid_id = {'weimei': 13, 'zipai':9, 'oumei':10}

# for picture
pathquery = pathquery.replace('21', str(fid_id['oumei']))

#page_pattern = re.compile(ur'\[\d{2}-\d{2}\]')
href_pattern = re.compile(ur'viewthread\.php\?tid=\d+.*extra=page%3D1$')

def crawl_subject(short_url, with_jpg=True, logfile=sys.stdout):
	url = "%s%s" % (domain, short_url)
	content = trunk.open_page(url)
	soup = BeautifulSoup(content)
	for img in soup('img', onclick=True):
		if trunk.download(img['src'], logfile=logfile) == 'existed':
			break

def crawl_content(content, clf=sys.stdout):
	soup = BeautifulSoup(content)
	for a in reversed(soup('a', href=href_pattern, title=None, style=None)):
#		clf.write('%s\n' % a.encode('gbk'))
		print a.encode('gbk')
		now = str(time.time())
		os.mkdir(now)
		os.chdir(now)
		crawl_subject(a['href'], logfile=clf)
		os.chdir('..')
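To make crawl_content's link selection concrete, a quick check of href_pattern against invented hrefs; only relative viewthread links ending in extra=page%3D1 match:

print bool(href_pattern.search(u'viewthread.php?tid=31337&extra=page%3D1'))  # True (tid invented)
print bool(href_pattern.search(u'viewthread.php?tid=31337'))                 # False: suffix missing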
Example #5
File: branch2.py Project: Lufay/crawl_seed
def crawl_page(subject='latest', page_id=1, clf=sys.stdout):
	content = trunk.open_page(domain + pathquery % (fid_id[subject], page_id))
	crawl_content(content, clf)
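A minimal driver sketch tying the snippets together, assuming crawl_page above is in scope and that branch2.py's fid_id includes the default 'latest' key; the page range and log filename are arbitrary:

if __name__ == '__main__':
	with open('crawl.log', 'w') as clf:
		for pid in range(1, 4):  # first three index pages, arbitrary range
			crawl_page(page_id=pid, clf=clf)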