class Controller(object):
    """Crawl Weibo search results for one movie in one-hour windows.

    The crawl range is read from the module-level ``START_TIME`` and
    ``END_TIME`` strings (format ``%Y-%m-%d-%H``). Every result page that is
    fetched is passed to ``Movie.decode_content``.
    """

    def __init__(self, movie_name):
        """Log in and create the ``Movie`` that receives the crawled pages.

        :param movie_name: search keyword; it is URL-quoted into each request.
        """
        super(Controller, self).__init__()
        self.session = login()
        self.movie_name = movie_name
        self.movie = Movie(self.movie_name, self.session)

    def _search_url(self, timescope):
        """Return the search URL for this movie restricted to ``timescope``.

        :param timescope: ``"<from>:<to>"`` string; it is URL-quoted here.
        """
        # The separator must be '&timescope='; the previous literal had been
        # garbled into a multiplication sign followed by 'cope='.
        return ('http://s.weibo.com/weibo/' + quote(self.movie_name)
                + '&timescope=custom:' + quote(timescope))

    def start(self):
        """Crawl every one-hour window from ``START_TIME`` onwards.

        The loop stops once the window start has moved past ``END_TIME``.
        Prints an error and does nothing when ``START_TIME`` is not strictly
        earlier than ``END_TIME``.
        """
        hour_format = '%Y-%m-%d-%H'
        start_time = datetime.datetime.strptime(START_TIME, hour_format)
        end_time = datetime.datetime.strptime(END_TIME, hour_format)
        if not start_time < end_time:
            print('ERROR:Start time must early than end time!')
            return

        one_hour = datetime.timedelta(hours=1)
        while True:
            # Pause before each search request.
            time.sleep(3)
            cursor_time = start_time + one_hour
            timescope = (start_time.strftime(hour_format) + ':'
                         + cursor_time.strftime(hour_format))
            url = self._search_url(timescope)
            response = self.session.get(url)
            count = self.get_page_count(response.content)
            self.handle_one_page(url, count, self.movie.id)
            start_time = cursor_time
            if start_time > end_time:
                break

    def get_page_count(self, content):
        """Return the largest ``&page=N`` number found in ``content``.

        :param content: raw text of a search result page.
        :returns: the highest page number linked from the page, or 0 when the
            page contains no ``&page=`` links.
        """
        pages = [int(number) for number in re.findall(r'&page=(\d+)', content)]
        max_count = max(pages) if pages else 0
        print('max_page: %s' % max_count)
        return max_count

    def handle_one_page(self, url_head, page_count, movie_id):
        """Fetch pages ``1..page_count`` of a search and decode each one.

        :param url_head: search URL without the ``&page=`` suffix.
        :param page_count: number of pages to fetch; 0 fetches nothing.
        :param movie_id: identifier forwarded to ``Movie.decode_content``.
        """
        for i in range(1, page_count + 1):
            page = str(i)
            url = url_head + "&page=" + page
            response = self.session.get(url)
            self.movie.decode_content(response.content, movie_id)
            print("现在是第%s页" % page)
            # Pause between page requests.
            time.sleep(3)
class Controller(object):
    """Crawl Weibo search results for one movie over a range of days.

    The range is read from the module-level ``START_DAY`` and ``END_DAY``
    strings (format ``%Y-%m-%d``). The whole range is searched first; when a
    search reports 50 or more result pages it is split into single days, and a
    day that again reports 50 or more pages is split into single hours.
    """

    def __init__(self, movie_name):
        """Log in and create the ``Movie`` that receives the crawled pages.

        :param movie_name: search keyword; it is URL-quoted into each request.
        """
        super(Controller, self).__init__()
        self.session = login()
        self.movie_name = movie_name
        self.movie = Movie(self.movie_name, self.session)
        # An earlier revision used 'http://s.weibo.com/weibo/' here.
        self.url = 'http://s.weibo.com/wb/'
        # Running total of result pages requested by handle_pages.
        self.page_count = 0

    def _search_url(self, timescope):
        """Return the time-sorted, de-duplicated search URL for ``timescope``.

        :param timescope: ``"<from>:<to>"`` string; it is URL-quoted here.
        """
        # The separator must be '&timescope='; the previous literal had been
        # garbled into a multiplication sign followed by 'cope='.
        return (self.url + quote(self.movie_name) + '&timescope=custom:'
                + quote(timescope) + '&xsort=time&nodup=1')

    def _fetch_page_count(self, url, captcha_notice):
        """Fetch ``url`` and return its page count, waiting out captcha pages.

        A response without page links that ``is_rebot`` recognises as the
        captcha page is retried after ``REBOT_SLEEP_TIME`` seconds.

        :param captcha_notice: message with one ``%s`` slot for the sleep time.
        """
        while True:
            content = self.session.get(url).content
            count = self.get_page_count(content)
            if count == 0 and self.is_rebot(content):
                print(captcha_notice % REBOT_SLEEP_TIME)
                time.sleep(REBOT_SLEEP_TIME)
                continue
            return count

    def start(self):
        """Validate the configured day range and start the crawl."""
        start_time = datetime.datetime.strptime(START_DAY, '%Y-%m-%d')
        end_time = datetime.datetime.strptime(END_DAY, '%Y-%m-%d')
        if start_time <= end_time:
            print('Name: %s' % self.movie_name)
            print('Time: %s -- %s' % (START_DAY, END_DAY))
            self.search_nomarl(start_time, end_time)
        else:
            print('ERROR:起始时间必须比结束时间早!')

    def search_nomarl(self, start_time, end_time):
        """Search the whole range at once; fall back to per-day searches.

        :param start_time: first day of the range (``datetime``).
        :param end_time: last day of the range (``datetime``).
        :returns: the result of ``handle_pages`` when the range was crawled
            directly, or ``'End'`` after the per-day fallback.
        """
        print('search_nomarl:::')
        # From here on both bounds are '%Y-%m-%d' strings; search_by_day and
        # handle_pages receive them in that form, as before.
        start_time = start_time.strftime('%Y-%m-%d')
        end_time = end_time.strftime('%Y-%m-%d')
        url = self._search_url(start_time + ':' + end_time)
        print(url)
        count = self._fetch_page_count(url, '变机器人了,需要帮助, sleep %ss')
        # Any count of 50 or more is sharded. The old '== 50' test left larger
        # counts unhandled, so the fetch loop never terminated for them.
        if count >= 50:
            print('数据等于50页,分片爬取')
            self.search_by_day(start_time, end_time)
            return 'End'
        if count == 0:
            print('数据0页~~!')
        else:
            print('数据小于50页,直接爬取')
        return self.handle_pages(url, count, self.movie.id,
                                 start_time, end_time)

    def search_by_day(self, start_time, end_time):
        """Crawl one day at a time, from ``end_time`` back to ``start_time``.

        Used when the plain search cannot cover everything in the period.

        :param start_time: first day, as a ``%Y-%m-%d`` string.
        :param end_time: last day, as a ``%Y-%m-%d`` string.
        """
        print('search_by_day:::')
        first_day = datetime.datetime.strptime(start_time, '%Y-%m-%d')
        day = datetime.datetime.strptime(end_time, '%Y-%m-%d')
        one_day = datetime.timedelta(days=1)
        while True:
            stamp = day.strftime('%Y-%m-%d')
            url = self._search_url(stamp + ':' + stamp)
            print(url)
            count = self._fetch_page_count(url, '变机器人了,需要帮助, sleep %ss')
            if count >= 50:
                # 50 pages or more: split the day into hours.
                print('数据等于50页,分片爬取')
                self.search_by_hour(day)
            else:
                # Fewer than 50 pages: crawl the day directly. A zero count is
                # crawled as well (handle_pages fetches page 1 for it), which
                # matches search_nomarl and search_by_hour; previously this
                # case had no exit from the fetch loop.
                if count == 0:
                    print('数据0页~~!')
                else:
                    print('数据小于50页,直接爬取')
                self.handle_pages(url, count, self.movie.id, day, day)
            day = day - one_day
            if first_day > day:
                break
            print('while sleep %ss...' % SEARCH_SLEEP_TIME)
            time.sleep(SEARCH_SLEEP_TIME)

    def search_by_hour(self, day):
        """Crawl a single day one hour at a time, from hour 23 down to hour 0.

        :param day: ``datetime`` inside the day to crawl; only its date is used.
        """
        print("search_by_hour:::")
        midnight = datetime.datetime.strptime(day.strftime('%Y-%m-%d'),
                                              '%Y-%m-%d')
        # i_day counts 24..1 and maps to hour i_day - 1, i.e. 23..0.
        for i_day in range(24, 0, -1):
            print('i_day %s' % i_day)
            hour_time = midnight + datetime.timedelta(hours=i_day - 1)
            stamp = hour_time.strftime('%Y-%m-%d-%H')
            url = self._search_url(stamp + ':' + stamp)
            print(url)
            # The notice now reports REBOT_SLEEP_TIME, which is what is
            # actually slept; it used to print SEARCH_SLEEP_TIME.
            count = self._fetch_page_count(
                url, '变机器人了,需要帮助,while sleep %ss...')
            if count == 0:
                print('数据0页~~!')
            self.handle_pages(url, count, self.movie.id, hour_time, hour_time)
            print('while sleep %ss...' % SEARCH_SLEEP_TIME)
            time.sleep(SEARCH_SLEEP_TIME)

    def get_page_count(self, content):
        """Return the largest ``&page=N`` number found in ``content``.

        :param content: raw text of a search result page.
        :returns: the highest linked page number, or 0 when there is none.
        """
        found = re.findall(r'&page=(\d+)', content)
        found.append(0)  # keeps max() defined when the page has no links
        max_count = max(int(number) for number in found)
        print('All Page: %s Max Page:--> %s' % (found, max_count))
        return max_count

    def is_rebot(self, content):
        """Return True when ``content`` is the captcha page.

        Sends a notification mail through ``send_mail`` each time the captcha
        marker text is found.
        """
        content = self.format_content(content)
        if re.search(r'我真滴不是机器人', content) is None:
            return False
        from send_mail import send_mail
        send_mail('机器人', '快填验证码!')
        return True

    def handle_pages(self, url_head, page_count, movie_id, start_time, end_time):
        """Fetch and decode up to ``page_count`` result pages of one search.

        :param url_head: search URL without the ``&page=`` suffix.
        :param page_count: number of pages reported; 0 is treated as 1.
        :param movie_id: identifier forwarded to ``Movie.decode_content``.
        :param start_time: lower time bound forwarded to ``decode_content``.
        :param end_time: upper time bound forwarded to ``decode_content``.
        :returns: the last value returned by ``decode_content``, or ``False``
            when nothing was decoded.
        """
        earliest_time = False
        if page_count == 0:
            page_count = 1
        for i in range(1, page_count + 1):
            self.page_count += 1
            url = url_head + "&page=%s" % i
            print('Start Page %s ->:%02d / %02d . All Data Page: %s'
                  % ('-' * 90, i, page_count, self.page_count))
            print(url)
            stop_crawl = False
            error_num = 0
            while True:
                content = self.session.get(url).content
                if self.is_rebot(content):
                    print('变机器人了,需要帮助,while sleep %ss...'
                          % REBOT_SLEEP_TIME)
                    time.sleep(REBOT_SLEEP_TIME)
                    continue
                earliest_time = self.movie.decode_content(
                    content, movie_id, start_time, end_time)
                if not earliest_time:
                    # A falsy result ends the whole crawl of this search.
                    stop_crawl = True
                    break
                if earliest_time == 'page_error':
                    # Retry the page once, then give up on it and move on.
                    time.sleep(1)
                    error_num += 1
                    if error_num <= 1:
                        continue
                    break
                # Randomised pause, weighted towards 4-5 seconds.
                n = random.choice([1, 2, 3, 3, 4, 4, 4, 4, 4, 5,
                                   5, 5, 5, 5, 5, 5, 5, 5, 5, 10])
                print('End Page %s ->:%02d / %02d while sleep %ss...\n'
                      % ('-' * 90, i, page_count, n))
                time.sleep(n)
                break
            # NOTE: the former 'error_num >= 3' exit was unreachable because
            # error_num never exceeds 2 for a page, so it has been dropped.
            if stop_crawl:
                break
            time.sleep(2)
        return earliest_time

    def format_content(self, content):
        """Decode ``\\uXXXX`` escapes in ``content`` and unescape ``\\/``.

        :returns: the UTF-8 encoded text with ``\\/`` replaced by ``/``.
        """
        r = content.decode('unicode_escape').encode("utf-8")
        return r.replace("\\/", "/")
class Controller(object):
    """Crawl Weibo search results for one movie between two configured days.

    The range is read from the module-level ``START_DAY`` and ``END_DAY``
    strings (format ``%Y-%m-%d``). An unrestricted search runs first; if the
    earliest post it reached is still later than the start of the range, the
    remaining days are crawled one by one, and a day reporting 50 or more
    result pages is crawled hour by hour.
    """

    def __init__(self, movie_name):
        """Log in and create the ``Movie`` that receives the crawled pages.

        :param movie_name: search keyword; it is URL-quoted into each request.
        """
        super(Controller, self).__init__()
        self.session = login()
        self.movie_name = movie_name
        self.movie = Movie(self.movie_name, self.session)

    def _search_url(self, timescope):
        """Return the search URL for this movie restricted to ``timescope``.

        :param timescope: ``"<from>:<to>"`` string; it is URL-quoted here.
        """
        # The separator must be '&timescope='; the previous literal had been
        # garbled into a multiplication sign followed by 'cope='.
        return ('http://s.weibo.com/weibo/' + quote(self.movie_name)
                + '&timescope=custom:' + quote(timescope))

    def _fetch_page_count(self, url):
        """Fetch ``url`` and return its page count, waiting out captcha pages.

        A response without page links that ``is_rebot`` recognises as the
        captcha page is retried after ``REBOT_SLEEP_TIME`` seconds.
        """
        while True:
            content = self.session.get(url).content
            count = self.get_page_count(content)
            if count == 0 and self.is_rebot(content):
                print('变机器人了,需要帮助!!')
                time.sleep(REBOT_SLEEP_TIME)
                continue
            return count

    def start(self):
        """Run the plain search, then the per-day search if it fell short."""
        start_time = datetime.datetime.strptime(START_DAY + '-0', '%Y-%m-%d-%H')
        end_time = datetime.datetime.strptime(END_DAY + '-23', '%Y-%m-%d-%H')
        if not start_time < end_time:
            print('ERROR:起始时间必须比结束时间早!')
            return
        earliest_time = self.search_nomarl(start_time, end_time)
        print('普通搜索获取到的最早时间: %s' % (earliest_time,))
        if earliest_time and earliest_time > start_time:
            # The plain search did not reach back to the start of the range.
            print('普通搜索没有搜索到足够数据,继续往前爬取!')
            self.search_by_day(start_time, earliest_time)

    def search_nomarl(self, start_time, end_time):
        """Plain search without a time restriction.

        :param start_time: lower time bound forwarded to ``handle_pages``.
        :param end_time: upper time bound forwarded to ``handle_pages``.
        :returns: whatever ``handle_pages`` returns for the result pages.
        """
        url = 'http://s.weibo.com/weibo/' + quote(self.movie_name)
        print(url)
        while True:
            print('url -------- true:')
            page_content = self.session.get(url).content
            # Debug aid: keep a copy of the last fetched page on disk.
            with open('tttttt', 'w') as f:
                f.write(page_content)
            print('file tttttt save ok..~!')
            count = self.get_page_count(page_content)
            if count == 0 and self.is_rebot(page_content):
                print('变机器人了,需要帮助!!')
                time.sleep(REBOT_SLEEP_TIME)
                continue
            return self.handle_pages(url, count, self.movie.id,
                                     start_time, end_time)

    def search_by_day(self, start_time, end_time):
        """Crawl one day at a time, from ``end_time`` back to ``start_time``.

        Used when the plain search cannot cover everything in the period.

        :param start_time: ``datetime`` marking the earliest day to crawl.
        :param end_time: ``datetime`` inside the latest day to crawl.
        """
        # Truncate to midnight of the last day.
        day = datetime.datetime.strptime(end_time.strftime('%Y-%m-%d'),
                                         '%Y-%m-%d')
        one_day = datetime.timedelta(days=1)
        while True:
            stamp = day.strftime('%Y-%m-%d')
            url = self._search_url(stamp + ':' + stamp)
            print(url)
            count = self._fetch_page_count(url)
            if count >= 50:
                # 50 pages or more: crawl the day hour by hour, then move on
                # to the previous day. The old code re-fetched the same day
                # after the hourly crawl and so repeated it indefinitely.
                print('数据等于50页,分片爬取')
                self.search_by_hour(day)
            else:
                # Fewer than 50 pages: crawl the day directly.
                if count == 0:
                    print('数据0页~~!')
                else:
                    print('数据小于50页,直接爬取')
                self.handle_pages(url, count, self.movie.id,
                                  day, day.replace(hour=23))
            day = day - one_day
            if start_time > day:
                break
            time.sleep(SEARCH_SLEEP_TIME)

    def search_by_hour(self, day):
        """Crawl a single day one hour at a time, from hour 23 down to hour 0.

        :param day: ``datetime`` inside the day to crawl; only its date is used.
        """
        midnight = datetime.datetime.strptime(day.strftime('%Y-%m-%d'),
                                              '%Y-%m-%d')
        for hour in range(23, -1, -1):
            hour_start = midnight + datetime.timedelta(hours=hour)
            stamp = hour_start.strftime('%Y-%m-%d-%H')
            url = self._search_url(stamp + ':' + stamp)
            print(url)
            count = self._fetch_page_count(url)
            self.handle_pages(url, count, self.movie.id,
                              hour_start, hour_start.replace(minute=59))
            # Pause between hours, but not after the last one.
            if hour > 0:
                time.sleep(SEARCH_SLEEP_TIME)

    def get_page_count(self, content):
        """Return the largest ``&page=N`` number found in ``content``.

        :param content: raw text of a search result page.
        :returns: the highest linked page number, or 0 when there is none.
        """
        result = re.findall(r'&page=\d+', content)
        # Debug aid: keep a copy of the last inspected page on disk.
        with open('ssssssss', 'w') as f:
            f.write(content)
        print('file ssssssss save ok..~!')
        print('page ----------------------->: %s' % (result,))
        pages = [int(link.split('=')[1]) for link in result]
        max_count = max(pages) if pages else 0
        print('当前搜索结果页数: %s' % max_count)
        return max_count

    def is_rebot(self, content):
        """Return True when ``content`` is the captcha page.

        Sends a notification mail through ``send_mail`` each time the captcha
        marker text is found.
        """
        content = self.format_content(content)
        if re.search(r'我真滴不是机器人', content) is None:
            return False
        send_mail('机器人', '快填验证码!')
        return True

    def handle_pages(self, url_head, page_count, movie_id, start_time, end_time):
        """Fetch and decode pages ``1..page_count`` of one search.

        :param url_head: search URL without the ``&page=`` suffix.
        :param page_count: number of pages to fetch; 0 fetches nothing.
        :param movie_id: identifier forwarded to ``Movie.decode_content``.
        :param start_time: lower time bound forwarded to ``decode_content``.
        :param end_time: upper time bound forwarded to ``decode_content``.
        :returns: the last value returned by ``decode_content``, or ``False``
            when nothing was decoded.
        """
        earliest_time = False
        for i in range(1, page_count + 1):
            page = str(i)
            url = url_head + "&page=" + page
            print(url)
            # Re-request the page for as long as the captcha page comes back.
            while True:
                content = self.session.get(url).content
                if not self.is_rebot(content):
                    break
                print('变机器人了,需要帮助!!')
                time.sleep(REBOT_SLEEP_TIME)
            earliest_time = self.movie.decode_content(
                content, movie_id, start_time, end_time)
            if not earliest_time:
                # A falsy result ends the crawl of this search.
                break
            print('现在是第 %s / %s 页' % (page, page_count))
            time.sleep(5)
        return earliest_time

    def format_content(self, content):
        """Decode ``\\uXXXX`` escapes in ``content`` and unescape ``\\/``.

        :returns: the UTF-8 encoded text with ``\\/`` replaced by ``/``.
        """
        r = content.decode('unicode_escape').encode("utf-8")
        return r.replace("\\/", "/")