def getUrls(url, tag_id):
    """Walk every listing page of one tag and hand each page to ``getUrl``.

    The page at ``url`` is fetched first and its total page count is read
    with ``getDouban_module.getTotalPage``.  When that count is 0 the page
    already fetched is passed to ``getUrl``; otherwise each page is fetched
    again through ``url + '?start=<offset>&type=T'`` with offsets advancing
    in steps of 20.  Finally ``db.setTmpIf(1, tag_id)`` is called.

    Args:
        url: base listing URL of the tag (no query string is expected,
            because one is appended here).
        tag_id: identifier forwarded to ``getUrl`` and ``db.setTmpIf``.
    """
    first_page = getDouban_module.getHtml(url)
    page_count = int(getDouban_module.getTotalPage(first_page))

    if page_count == 0:
        # Nothing to paginate: process the page that was just downloaded.
        getUrl(first_page, tag_id)
    else:
        # Offsets 0, 20, 40, ... -- one request per listing page.
        for offset in range(0, 20 * page_count, 20):
            page_url = url + '?start=' + str(offset) + '&type=T'
            getUrl(getDouban_module.getHtml(page_url), tag_id)

    # update tmp_tags tag_if (presumably flags the tag as processed -- confirm
    # against db.setTmpIf)
    db.setTmpIf(1, tag_id)
def getUrls(url, tag_id):
    """Walk every listing page of one tag and hand each page to ``getUrl``.

    The page at ``url`` is fetched first and its total page count is read
    with ``getDouban_module.getTotalPage``.  When that count is 0 the page
    already fetched is passed to ``getUrl``; otherwise each page is fetched
    again through ``url + '?start=<offset>&type=T'`` with offsets advancing
    in steps of 20.  Finally ``db.setTmpIf(1, tag_id)`` is called.

    Args:
        url: base listing URL of the tag (no query string is expected,
            because one is appended here).
        tag_id: identifier forwarded to ``getUrl`` and ``db.setTmpIf``.
    """
    first_page = getDouban_module.getHtml(url)
    page_count = int(getDouban_module.getTotalPage(first_page))

    if page_count == 0:
        # Nothing to paginate: process the page that was just downloaded.
        getUrl(first_page, tag_id)
    else:
        # Offsets 0, 20, 40, ... -- one request per listing page.
        for offset in range(0, 20 * page_count, 20):
            page_url = url + '?start=' + str(offset) + '&type=T'
            getUrl(getDouban_module.getHtml(page_url), tag_id)

    # update tmp_tags tag_if (presumably flags the tag as processed -- confirm
    # against db.setTmpIf)
    db.setTmpIf(1, tag_id)
def getTmpContent():
    """Fetch and store the detail page of each film row selected from the db.

    Reads a number from stdin, passes it to ``db.selectTmpFilm`` and walks
    the returned rows (index 0 = film id, 2 = tag id, 3 = page URL).  For
    every row the page is downloaded, its title, info and related info are
    extracted with ``getDouban_module`` and saved via
    ``db.saveTmpFilmContent``; the row is then flagged with
    ``db.setTmpFilmIf(1, film_id)``.

    A failed download is appended to ``filmError.txt``, flagged with
    ``db.setTempFilmError(film_id, 1)`` and the row is skipped.

    After every five minutes of work the loop sleeps for a random
    60-180 seconds before continuing.

    Raises:
        ValueError: if the text entered on stdin is not an integer.
    """
    # Single-argument print(...) is equivalent to the print statement.
    print('Please enter a number:')
    n = raw_input()
    urls = db.selectTmpFilm(int(n))

    pause_interval = 5 * 60  # seconds of work between two pauses
    last_pause = time.time()

    for ur in urls:
        # Pause once at least `pause_interval` seconds have passed since the
        # previous pause.  The former test (`cost % minute == 0`) was only
        # true when an iteration happened to start on an exact multiple of
        # 300 elapsed seconds, so the pause was almost never taken.
        if time.time() - last_pause >= pause_interval:
            second = random.uniform(60, 180)
            print('sleep ' + str(second) + '.....')
            time.sleep(second)
            print('wake up!Start to work...')
            last_pause = time.time()

        film_id = ur[0]
        tag_id = ur[2]
        url = ur[3]

        try:
            html = getDouban_module.getHtml(url)
        except Exception:
            # Not a bare `except:` -- KeyboardInterrupt and SystemExit must
            # still be able to stop the run instead of being logged as a
            # download failure.
            msg = 'get ' + str(tag_id) + ' ' + url + ' faild\n'
            filename = 'filmError.txt'
            print('Error:' + msg + '\n')
            getDouban_module.saveData(msg, filename)
            error_if = 1
            db.setTempFilmError(film_id, error_if)
            continue

        tmp_title = getDouban_module.getTitle(html)
        tmp_info = getDouban_module.getInfo(html)
        tmp_related_info = getDouban_module.getRelatedInfo(html)
        db.saveTmpFilmContent(tag_id, tmp_title, tmp_info, tmp_related_info)
        db.setTmpFilmIf(1, film_id)
        print(tmp_title + ' save success')

    print('for loop is over')
def getTmpContent():
    """Fetch and store the detail page of each film row selected from the db.

    Reads a number from stdin, passes it to ``db.selectTmpFilm`` and walks
    the returned rows (index 0 = film id, 2 = tag id, 3 = page URL).  For
    every row the page is downloaded, its title, info and related info are
    extracted with ``getDouban_module`` and saved via
    ``db.saveTmpFilmContent``; the row is then flagged with
    ``db.setTmpFilmIf(1, film_id)``.

    A failed download is appended to ``filmError.txt``, flagged with
    ``db.setTempFilmError(film_id, 1)`` and the row is skipped.

    After every five minutes of work the loop sleeps for a random
    60-180 seconds before continuing.

    Raises:
        ValueError: if the text entered on stdin is not an integer.
    """
    # Single-argument print(...) is equivalent to the print statement.
    print('Please enter a number:')
    n = raw_input()
    urls = db.selectTmpFilm(int(n))

    pause_interval = 5 * 60  # seconds of work between two pauses
    last_pause = time.time()

    for ur in urls:
        # Pause once at least `pause_interval` seconds have passed since the
        # previous pause.  The former test (`cost % minute == 0`) was only
        # true when an iteration happened to start on an exact multiple of
        # 300 elapsed seconds, so the pause was almost never taken.
        if time.time() - last_pause >= pause_interval:
            second = random.uniform(60, 180)
            print('sleep ' + str(second) + '.....')
            time.sleep(second)
            print('wake up!Start to work...')
            last_pause = time.time()

        film_id = ur[0]
        tag_id = ur[2]
        url = ur[3]

        try:
            html = getDouban_module.getHtml(url)
        except Exception:
            # Not a bare `except:` -- KeyboardInterrupt and SystemExit must
            # still be able to stop the run instead of being logged as a
            # download failure.
            msg = 'get ' + str(tag_id) + ' ' + url + ' faild\n'
            filename = 'filmError.txt'
            print('Error:' + msg + '\n')
            getDouban_module.saveData(msg, filename)
            error_if = 1
            db.setTempFilmError(film_id, error_if)
            continue

        tmp_title = getDouban_module.getTitle(html)
        tmp_info = getDouban_module.getInfo(html)
        tmp_related_info = getDouban_module.getRelatedInfo(html)
        db.saveTmpFilmContent(tag_id, tmp_title, tmp_info, tmp_related_info)
        db.setTmpFilmIf(1, film_id)
        print(tmp_title + ' save success')

    print('for loop is over')
# Filename: getDouban_tags.py
#
# Downloads the Douban movie tag index and stores every tag it lists.
# getDouban_module.getTagsHtml splits the index page into chunks; each chunk
# is treated as one "kind" of tags and numbered from 1.  Every tag found in
# a chunk is saved through db.saveTagUrl together with its kind number, its
# name and its absolute URL.
import getDouban_module
from MyDB import MyDB

# Connection settings passed positionally to MyDB (presumably host, user,
# password, database name, charset -- confirm against MyDB.__init__).
host = 'localhost'
root = 'root'
pwd = ''
db_name = 'movies'
chset = 'utf8'
# Kept in its own name so `db` only ever refers to the MyDB handle.
db = MyDB(host, root, pwd, db_name, chset)

url = 'http://movie.douban.com/tag/?view=type'
pre = 'http://movie.douban.com/tag/'

html = getDouban_module.getHtml(url)
tagsHtml = getDouban_module.getTagsHtml(html)

i = 1  # kind number of the chunk currently being processed
j = 0  # running count of tags saved
for tagHtml in tagsHtml:
    tagUrl = getDouban_module.getUrl(tagHtml)
    for tu in tagUrl:
        kind_id = i
        tag_name = tu[1]
        # The tag name doubles as the last path segment of its URL.
        tag_url = pre + tu[1]
        db.saveTagUrl(kind_id, tag_name, tag_url)
        j = j + 1
    # Move on to the next kind.  Without this step `i` stayed at 1 and every
    # tag was stored under kind_id 1.
    i = i + 1
# Filename: getDouban_tags.py
#
# Downloads the Douban movie tag index and stores every tag it lists.
# getDouban_module.getTagsHtml splits the index page into chunks; each chunk
# is treated as one "kind" of tags and numbered from 1.  Every tag found in
# a chunk is saved through db.saveTagUrl together with its kind number, its
# name and its absolute URL.
import getDouban_module
from MyDB import MyDB

# Connection settings passed positionally to MyDB (presumably host, user,
# password, database name, charset -- confirm against MyDB.__init__).
host = 'localhost'
root = 'root'
pwd = ''
db_name = 'movies'
chset = 'utf8'
# Kept in its own name so `db` only ever refers to the MyDB handle.
db = MyDB(host, root, pwd, db_name, chset)

url = 'http://movie.douban.com/tag/?view=type'
pre = 'http://movie.douban.com/tag/'

html = getDouban_module.getHtml(url)
tagsHtml = getDouban_module.getTagsHtml(html)

i = 1  # kind number of the chunk currently being processed
j = 0  # running count of tags saved
for tagHtml in tagsHtml:
    tagUrl = getDouban_module.getUrl(tagHtml)
    for tu in tagUrl:
        kind_id = i
        tag_name = tu[1]
        # The tag name doubles as the last path segment of its URL.
        tag_url = pre + tu[1]
        db.saveTagUrl(kind_id, tag_name, tag_url)
        j = j + 1
    # Move on to the next kind.  Without this step `i` stayed at 1 and every
    # tag was stored under kind_id 1.
    i = i + 1