示例#1
0
def getUrls(url, tag_id):
    """Collect the entries of every listing page under ``url`` for one tag.

    Downloads ``url`` and reads its total page count.  When the count is 0
    the already-downloaded HTML is handed to ``getUrl``; otherwise each page
    is requested through ``?start=<20 * page>&type=T`` and handed to
    ``getUrl`` in turn.  Afterwards ``db.setTmpIf(1, tag_id)`` is called
    (presumably flagging the tag as done in tmp_tags -- confirm in ``db``).

    :param url: base listing URL of the tag, without a query string.
    :param tag_id: identifier forwarded to ``getUrl`` and ``db.setTmpIf``.
    """
    listing_html = getDouban_module.getHtml(url)
    total = int(getDouban_module.getTotalPage(listing_html))

    if total != 0:
        for page_no in range(total):
            # 20 entries per page: the start offset advances in steps of 20.
            start = 20 * page_no
            paged_url = url + '?start=' + str(start) + '&type=T'
            getUrl(getDouban_module.getHtml(paged_url), tag_id)
    else:
        # No pagination reported: the first download is the only page.
        getUrl(listing_html, tag_id)

    db.setTmpIf(1, tag_id)
示例#2
0
def getUrls(url, tag_id):
    """Walk all listing pages of a tag and pass each page to ``getUrl``.

    The page at ``url`` is fetched first to obtain the total page count.
    A count of 0 means that single page is processed directly; any other
    count makes the function fetch ``url + '?start=N&type=T'`` for
    N = 0, 20, 40, ... (one request per page) and process each result.
    Once all pages went through, ``db.setTmpIf(1, tag_id)`` is invoked
    (presumably marking the tag as processed -- verify against ``db``).

    :param url: base listing URL of the tag, without a query string.
    :param tag_id: identifier forwarded to ``getUrl`` and ``db.setTmpIf``.
    """
    first_page = getDouban_module.getHtml(url)
    page_count = int(getDouban_module.getTotalPage(first_page))

    if page_count == 0:
        # Nothing to paginate; reuse the HTML that was just downloaded.
        getUrl(first_page, tag_id)
    else:
        # Step the offset by 20 so it yields exactly page_count requests.
        for offset in range(0, 20 * page_count, 20):
            query = '?start=' + str(offset) + '&type=T'
            page_html = getDouban_module.getHtml(url + query)
            getUrl(page_html, tag_id)

    db.setTmpIf(1, tag_id)
def getTmpContent():
    """Download and store the content of a batch of pending films.

    Reads a number from stdin, passes it to ``db.selectTmpFilm`` and, for
    each returned row, downloads the page at ``row[3]``, extracts title,
    info and related info through ``getDouban_module`` and stores them with
    ``db.saveTmpFilmContent`` before calling ``db.setTmpFilmIf(1, film_id)``.

    A row whose download raises is reported on stdout, appended to
    ``filmError.txt``, flagged through ``db.setTempFilmError`` and skipped.

    Whenever at least five minutes have passed since the start or since the
    previous rest, the loop sleeps for a random 60-180 seconds before
    continuing (presumably to throttle requests to the site).

    :raises ValueError: if the text entered on stdin is not an integer.
    """
    print('Please enter a number:')
    n = raw_input()
    urls = db.selectTmpFilm(int(n))

    work_interval = 5 * 60
    last_rest = time.time()
    for ur in urls:
        # Compare elapsed time with >=: testing for an exact multiple of the
        # interval would skip the rest whenever an iteration straddles that
        # particular second.
        if time.time() - last_rest >= work_interval:
            second = random.uniform(60, 180)
            print('sleep ' + str(second) + '.....')
            time.sleep(second)
            print('wake up!Start to work...')
            last_rest = time.time()

        url = ur[3]
        tag_id = ur[2]
        film_id = ur[0]
        try:
            html = getDouban_module.getHtml(url)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # still stop the run instead of being logged as a failed fetch.
            msg = 'get ' + str(tag_id) + ' ' + url + ' faild\n'
            print('Error:' + msg + '\n')
            getDouban_module.saveData(msg, 'filmError.txt')
            db.setTempFilmError(film_id, 1)
            continue

        tmp_title = getDouban_module.getTitle(html)
        tmp_info = getDouban_module.getInfo(html)
        tmp_related_info = getDouban_module.getRelatedInfo(html)
        db.saveTmpFilmContent(tag_id, tmp_title, tmp_info, tmp_related_info)
        db.setTmpFilmIf(1, film_id)
        print(tmp_title + ' save success')
    print('for loop is over')
def getTmpContent():
    """Fetch the page of every selected pending film and save its content.

    Prompts on stdin for a number and passes it to ``db.selectTmpFilm``.
    For each row returned, ``row[0]`` is used as the film id, ``row[2]`` as
    the tag id and ``row[3]`` as the URL to download.  The title, info and
    related info extracted by ``getDouban_module`` are stored with
    ``db.saveTmpFilmContent`` and the film is then passed to
    ``db.setTmpFilmIf(1, film_id)``.

    If a download raises, the failure is printed, appended to
    ``filmError.txt``, recorded with ``db.setTempFilmError(film_id, 1)`` and
    the row is skipped.

    After every five minutes of work the loop rests for a random 60-180
    seconds (presumably to keep the request rate low).

    :raises ValueError: if the text entered on stdin is not an integer.
    """
    print('Please enter a number:')
    n = raw_input()
    urls = db.selectTmpFilm(int(n))

    rest_every = 5 * 60
    rested_at = time.time()
    for ur in urls:
        # Elapsed-time check with >=; an "exact multiple of 300 seconds"
        # test only fires if an iteration happens to start on that second.
        if time.time() - rested_at >= rest_every:
            second = random.uniform(60, 180)
            print('sleep ' + str(second) + '.....')
            time.sleep(second)
            print('wake up!Start to work...')
            rested_at = time.time()

        url = ur[3]
        tag_id = ur[2]
        film_id = ur[0]
        try:
            html = getDouban_module.getHtml(url)
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit must still be
            # able to abort the run.
            msg = 'get ' + str(tag_id) + ' ' + url + ' faild\n'
            filename = 'filmError.txt'
            print('Error:' + msg + '\n')
            getDouban_module.saveData(msg, filename)
            db.setTempFilmError(film_id, 1)
            continue

        tmp_title = getDouban_module.getTitle(html)
        tmp_info = getDouban_module.getInfo(html)
        tmp_related_info = getDouban_module.getRelatedInfo(html)
        db.saveTmpFilmContent(tag_id, tmp_title, tmp_info, tmp_related_info)
        db.setTmpFilmIf(1, film_id)
        print(tmp_title + ' save success')
    print('for loop is over')
示例#5
0
# Filename: getDouban_tags.py

import getDouban_module
from MyDB import MyDB

# Arguments handed to MyDB below, in the order host, user, password,
# database name, charset.
host = 'localhost'
root = 'root'
pwd = ''
db = 'movies'
chset = 'utf8'
# NOTE: `db` is rebound here -- it held the database name string above and
# refers to the MyDB instance from this line on.
db = MyDB(host,root,pwd,db,chset)

# Page that is downloaded to obtain the tag listing.
url = 'http://movie.douban.com/tag/?view=type'
# Prefix prepended to each tag name to build that tag's URL.
pre = 'http://movie.douban.com/tag/'
html = getDouban_module.getHtml(url)
tagsHtml = getDouban_module.getTagsHtml(html)
# print tagsHtml
# NOTE(review): `i` is used as kind_id but is never incremented in the
# visible lines, so every saved tag gets kind_id 1 -- confirm whether an
# increment per tagHtml group is missing.
i = 1
# `j` counts the saveTagUrl calls; it is not read in the visible lines.
j = 0
for tagHtml in tagsHtml:
	# f = file('tagUrl.txt','a+')
	# f.write('\n------------' + str(i) + '------------------\n')
	tagUrl = getDouban_module.getUrl(tagHtml)
	for tu in tagUrl:
		# f.write('\n' + tu[1] + '\n')
		# f.write('\n' + pre + tu[1] + '\n')
		kind_id = i
		# Element 1 of each entry is used both as the tag name and as the
		# last path segment of the tag URL.
		tag_name = tu[1]
		tag_url = pre + tu[1]
		db.saveTagUrl(kind_id, tag_name, tag_url)
		j = j + 1
示例#6
0
# Filename: getDouban_tags.py

import getDouban_module
from MyDB import MyDB

# Arguments handed to MyDB below, in the order host, user, password,
# database name, charset.
host = 'localhost'
root = 'root'
pwd = ''
db = 'movies'
chset = 'utf8'
# NOTE: `db` is rebound here -- it held the database name string above and
# refers to the MyDB instance from this line on.
db = MyDB(host, root, pwd, db, chset)

# Page that is downloaded to obtain the tag listing.
url = 'http://movie.douban.com/tag/?view=type'
# Prefix prepended to each tag name to build that tag's URL.
pre = 'http://movie.douban.com/tag/'
html = getDouban_module.getHtml(url)
tagsHtml = getDouban_module.getTagsHtml(html)
# print tagsHtml
# NOTE(review): `i` is used as kind_id but is never incremented in the
# visible lines, so every saved tag gets kind_id 1 -- confirm whether an
# increment per tagHtml group is missing.
i = 1
# `j` counts the saveTagUrl calls; it is not read in the visible lines.
j = 0
for tagHtml in tagsHtml:
    # f = file('tagUrl.txt','a+')
    # f.write('\n------------' + str(i) + '------------------\n')
    tagUrl = getDouban_module.getUrl(tagHtml)
    for tu in tagUrl:
        # f.write('\n' + tu[1] + '\n')
        # f.write('\n' + pre + tu[1] + '\n')
        kind_id = i
        # Element 1 of each entry is used both as the tag name and as the
        # last path segment of the tag URL.
        tag_name = tu[1]
        tag_url = pre + tu[1]
        db.saveTagUrl(kind_id, tag_name, tag_url)
        j = j + 1