def save():
    """Persist crawler state (startId, lastQuery, cache, maybe) to record.pkl.

    Dumps each attribute with pickle protocol 0 (ASCII, human-readable).
    I/O failures are logged at debug level and swallowed, never raised.
    Reads state from the enclosing `self` (free variable).
    """
    try:
        # 'with' guarantees the handle is closed even if a dump raises
        # (the original leaked the file object on a mid-dump failure).
        with open('record.pkl', 'wb') as fs:
            pickle.dump(self.startId, fs, 0)
            pickle.dump(self.lastQuery, fs, 0)
            pickle.dump(self.cache, fs, 0)
            pickle.dump(self.maybe, fs, 0)
    except IOError:
        menulog.debug(u'保存缓存失败')
def getWebContent(url):
    """Fetch *url* (with '&companyId=1' appended) and return the page as unicode.

    Any failure is logged at debug level and swallowed; callers receive ''
    instead of an exception.
    """
    try:
        target = url + '&companyId=1'
        request = urllib2.Request(target)
        request.add_header(
            'User-Agent',
            'Mozilla/5.0 (Linux; Android 6.0; PRO 6 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/44.0.2403.130 Mobile Safari/537.36 YiXin/4.8.3')
        response = urllib2.urlopen(request)
        return response.read().decode('utf-8')
    except Exception as e:
        menulog.debug(str(e))
        return ''
def getWebContent(url):
    """Download the resource at *url*, forcing the companyId=1 query parameter.

    Returns the UTF-8-decoded body, or '' if anything goes wrong (the error
    is only logged, never propagated to the caller).
    """
    try:
        req = urllib2.Request(url + '&companyId=1')
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Linux; Android 6.0; PRO 6 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/44.0.2403.130 Mobile Safari/537.36 YiXin/4.8.3'
        )
        raw = urllib2.urlopen(req).read()
        return raw.decode('utf-8')
    except Exception as exc:
        menulog.debug(str(exc))
        return ''
def addOne(page=1):
    """Increment the visit counter for *page* (0 = home, 1 = menu).

    Lazily opens the module-global shelf `s` on first use; any error is
    logged at debug level and swallowed. The very first visit records 0.
    """
    try:
        if not s:
            # open on demand; writeback=True so in-place updates persist on sync
            globals()['s'] = shelve.open('visit_count.dat', writeback=True)
        counter_key = {0: 'count_home', 1: 'count_menu'}.get(page)
        if counter_key is not None:
            previous = s.get(counter_key)
            s[counter_key] = 0 if previous is None else previous + 1
        s.sync()
    except Exception as e:
        menulog.debug(e)
def getWebContent(url):
    """Return the page body for *url*, consulting the local cache first.

    The cache key is derived from the query string ('a=b&c=d' -> 'a_b&c_d').
    On a miss the page is fetched, decoded as UTF-8 and written back to the
    cache. Returns '' on any error (logged at debug level).
    """
    try:
        key = url.split('?')[1].replace('=', '_')
        cached = cache.get(key)
        if cached:
            return cached
        # the endpoint requires this extra parameter (added in an update)
        request = urllib2.Request(url + '&companyId=1')
        request.add_header(
            'User-Agent',
            'Mozilla/5.0 (Linux; Android 6.0; PRO 6 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/44.0.2403.130 Mobile Safari/537.36 YiXin/4.8.3')
        body = urllib2.urlopen(request).read().decode('utf-8')
        saveCache(key, body)
        return body
    except Exception as e:
        menulog.debug(str(e))
        return ''
def getWebContent(url):
    """Cached page fetch keyed on the URL's query string.

    A cache hit short-circuits the network call; a miss downloads the page
    (with companyId=1 appended), decodes it as UTF-8, stores it via
    saveCache, and returns it. Errors are logged and '' is returned.
    """
    try:
        fname = url.split('?')[1].replace('=', '_')
        hit = cache.get(fname)
        if hit:
            return hit
        req = urllib2.Request(url + '&companyId=1')  # companyId is required by the endpoint
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Linux; Android 6.0; PRO 6 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/44.0.2403.130 Mobile Safari/537.36 YiXin/4.8.3'
        )
        html = urllib2.urlopen(req).read().decode('utf-8')
        saveCache(fname, html)
        return html
    except Exception as err:
        menulog.debug(str(err))
        return ''
def getWeekDayFromDay(daytime):
    """Return the Chinese weekday name for a short date like 160517.

    *daytime* is a 6-digit yymmdd value (int or str); the century '20' is
    prepended before parsing, e.g. 160517 -> 2016-05-17 -> u'星期二'.
    Returns u'' when the value cannot be parsed as a valid date.
    """
    try:
        full = '20' + str(daytime)      # e.g. '20160517'
        year = int(full[:4])            # 2016
        month = int(full[4:6])          # 5
        day = int(full[6:8])            # 17
        weekday = datetime(year, month, day, 0, 0, 0, 0).weekday()
        weekdaynames = {
            0: u'星期一',
            1: u'星期二',
            2: u'星期三',
            3: u'星期四',
            4: u'星期五',
            5: u'星期六',
            6: u'星期日',
        }
        return weekdaynames.get(weekday, u'')
    except (ValueError, TypeError):
        # was a bare 'except:' that hid every error (even KeyboardInterrupt);
        # only parse/conversion failures are expected here
        menulog.debug(u'获取星期几错误')
        return u''
def getWeekDayFromDay(daytime):
    """Map a short yymmdd date (e.g. 160517) to its Chinese weekday name.

    The input may be an int or a string; '20' is prepended to form a full
    yyyymmdd string before parsing. Unparseable input yields u''.
    """
    try:
        daystr = '20' + str(daytime)    # '20160517'
        year = int(daystr[:4])
        month = int(daystr[4:6])
        day = int(daystr[6:8])
        weekday = datetime(year, month, day, 0, 0, 0, 0).weekday()
        weekdaynames = {
            0: u'星期一',
            1: u'星期二',
            2: u'星期三',
            3: u'星期四',
            4: u'星期五',
            5: u'星期六',
            6: u'星期日',
        }
        return weekdaynames.get(weekday, u'')
    except (ValueError, TypeError):
        # narrowed from a bare 'except:'; only date-parsing errors are expected
        menulog.debug(u'获取星期几错误')
        return u''
def process(self):
    """One crawl round: scan menu ids upward from startId and record found menus.

    State (startId, lastQuery, cache, maybe) is persisted in the dbm database
    `datafile`; values are stored via str() and restored via eval().
    NOTE(review): eval() on the stored strings is only acceptable because the
    file is produced locally by this process — confirm nothing else writes it.
    Also writes db['future'], the sorted list of cached days >= today.
    Clears self.running in all cases.
    """
    self.count += 1
    # today as a yymmdd integer, e.g. 160517
    self.today = int(time.strftime('%y%m%d', time.localtime()))
    menulog.info(u'开始第%d次查找@%d' % (self.count, self.getTime()))
    try:
        db = dbm.open(datafile, 'c')
        if not len(db):
            # no previous data file: seed the database with defaults
            db['startId'] = str(startId)
            db['lastQuery'] = str(self.getTime())
            db['cache'] = str(self.cache)
            db['maybe'] = str(self.maybe)
        # rewind the starting point by self.back ids to re-check recent pages
        self.startId = eval(db['startId']) - self.back
        self.cache = eval(db['cache'])
        self.maybe = eval(db['maybe'])
        self.nowId = self.startId
        self.lastQuery = self.getTime()  # remember when this search ran
        while self.nowId - self.startId < self.interval:
            menulog.info(u'开始查找: %d' % self.nowId)
            text = getWebContent(urlhead + str(self.nowId))
            if text.find(u'今日菜单') != -1 and text.find(u'本帮菜') != -1:
                self.empty = 0
                try:
                    year = re.findall(pattern_year, text)[0]
                    monthday = re.findall(pattern_month, text)
                    if monthday[0] == '0' and len(monthday) > 2:
                        # month digits were split across markup: rejoin them
                        month = monthday[0] + monthday[1]
                        dayIndex = 2
                    else:
                        month = monthday[0]
                        dayIndex = 1
                    if len(monthday) > dayIndex:
                        day = monthday[dayIndex]
                        if len(day) == 1:
                            # handles day split across markup, e.g. 1</span>...>5日
                            # (the month above has the same issue)
                            day += re.findall(pattern_day2, text)[0]
                    else:
                        day = re.findall(pattern_day, text)[0]
                    # month the menu was published, used to detect year rollover
                    update_month = re.findall(pattern_month_update, text)[0]
                    if int(update_month) == 12 and int(month) == 1:
                        year = str(int(year) + 1)
                    thisday = int(year + month + day)
                    self.startId = self.nowId
                    if self.cache.has_key(thisday):
                        menulog.info(u'更新%s的菜单id为%s' % (thisday, self.nowId))
                    self.cache[thisday] = self.nowId
                    menulog.info('find %d' % self.nowId)
                except (IndexError, ):
                    # date patterns didn't match: remember the id as a maybe-menu
                    if self.nowId not in self.maybe:
                        self.maybe.append(self.nowId)
                        menulog.debug('IndexError add maybe')
            else:
                if text.find(u'请求素材不存在') == -1:
                    # result page has content but is not a menu
                    self.usedId = self.nowId
                    self.empty = 0
                else:
                    self.empty += 1
                    menulog.info('empty(%d) %d' % (self.empty, self.nowId))
                    if self.empty > self.maxEmpty:
                        # too many consecutive empty ids: stop this round early
                        menulog.debug('break this round')
                        break
            self.nowId += 1
        # if self.maybe and max(self.maybe) > max(self.cache.values()):
        #     # design abandoned: page formats vary too much and could wedge the crawl
        #     menulog.info(u'更新起点至可能的ID:%d'% max(self.maybe))
        #     self.startId = max(self.maybe)
        if self.usedId > self.startId:
            menulog.info(u'更新起点至%d' % self.usedId)
            self.startId = self.usedId
        # persist the updated state
        db['startId'] = str(self.startId)
        db['lastQuery'] = str(self.lastQuery)
        db['cache'] = str(self.cache)
        db['maybe'] = str(self.maybe)
        menulog.info(u'第%d次查找结束' % self.count)
        # rebuild the list of already-found menus dated today or later
        self.cache = eval(db['cache'])
        future = []
        for day in self.cache.keys():
            if day >= self.today:
                future.append(day)
        future.sort()
        db['future'] = str(future)
        menulog.info(u'更新今后已找到的菜单列表')
        db.close()
    except (IOError, EOFError):
        menulog.info(u'缓存读取/创建异常')
    finally:
        self.running = False
def process(self):
    """One crawl round (variant without the '本帮菜' filter): scan ids and record menus.

    Persists state in the dbm database `datafile` (str() out, eval() in).
    NOTE(review): eval() on stored data assumes the file is written only by
    this process — confirm. Clears self.running in all cases.
    """
    self.count += 1
    # today as a yymmdd integer, e.g. 160517
    self.today = int(time.strftime('%y%m%d', time.localtime()))
    menulog.info(u'开始第%d次查找@%d' % (self.count, self.getTime()))
    try:
        db = dbm.open(datafile, 'c')
        if not len(db):
            # no previous data file: seed with defaults
            db['startId'] = str(startId)
            db['lastQuery'] = str(self.getTime())
            db['cache'] = str(self.cache)
            db['maybe'] = str(self.maybe)
        # rewind by self.back ids to re-check recently scanned pages
        self.startId = eval(db['startId']) - self.back
        self.cache = eval(db['cache'])
        self.maybe = eval(db['maybe'])
        self.nowId = self.startId
        self.lastQuery = self.getTime()  # remember when this search ran
        while self.nowId - self.startId < self.interval:
            menulog.info(u'开始查找: %d' % self.nowId)
            text = getWebContent(urlhead + str(self.nowId))
            if text.find(u'今日菜单') != -1:
                self.empty = 0
                try:
                    year = re.findall(pattern_year, text)[0]
                    monthday = re.findall(pattern_month, text)
                    if monthday[0] == '0' and len(monthday) > 2:
                        # month digits split across markup: rejoin them
                        month = monthday[0] + monthday[1]
                        dayIndex = 2
                    else:
                        month = monthday[0]
                        dayIndex = 1
                    if len(monthday) > dayIndex:
                        day = monthday[dayIndex]
                        if len(day) == 1:
                            # handles day split across markup, e.g. 1</span>...>5日
                            # (the month above has the same issue)
                            day += re.findall(pattern_day2, text)[0]
                    else:
                        day = re.findall(pattern_day, text)[0]
                    # month the menu was published, used to detect year rollover
                    update_month = re.findall(pattern_month_update, text)[0]
                    if int(update_month) == 12 and int(month) == 1:
                        year = str(int(year) + 1)
                    thisday = int(year + month + day)
                    self.startId = self.nowId
                    if self.cache.has_key(thisday):
                        menulog.info(u'更新%s的菜单id为%s' % (thisday, self.nowId))
                    self.cache[thisday] = self.nowId
                    menulog.info('find %d' % self.nowId)
                except (IndexError, ):
                    # NOTE(review): str.find returns -1 (truthy) when absent, so this
                    # condition is effectively always true unless a marker is at index 0
                    # — probably meant `!= -1` on both. Left as-is (doc-only pass).
                    if text.find(u'祝您用餐愉快') and text.find(u'农历'):
                        menulog.debug('gz menu')
                    elif self.nowId not in self.maybe:
                        self.maybe.append(self.nowId)
                        menulog.debug('IndexError add maybe')
            else:
                if text.find(u'请求素材不存在') == -1:
                    # result page has content but is not a menu
                    self.usedId = self.nowId
                    self.empty = 0
                else:
                    self.empty += 1
                    if self.empty > 10:
                        # too many consecutive empty ids: stop this round early
                        menulog.debug('break this round')
                        break
            self.nowId += 1
        # if self.maybe and max(self.maybe) > max(self.cache.values()):
        #     # design abandoned: page formats vary too much and could wedge the crawl
        #     menulog.info(u'更新起点至可能的ID:%d'% max(self.maybe))
        #     self.startId = max(self.maybe)
        if self.usedId > self.startId:
            menulog.info(u'更新起点至%d' % self.usedId)
            self.startId = self.usedId
        # persist the updated state
        db['startId'] = str(self.startId)
        db['lastQuery'] = str(self.lastQuery)
        db['cache'] = str(self.cache)
        db['maybe'] = str(self.maybe)
        menulog.info(u'第%d次查找结束' % self.count)
        # rebuild the list of already-found menus dated today or later
        self.cache = eval(db['cache'])
        future = []
        for day in self.cache.keys():
            if day >= self.today:
                future.append(day)
        future.sort()
        db['future'] = str(future)
        menulog.info(u'更新今后已找到的菜单列表')
        db.close()
    except (IOError, EOFError):
        menulog.info(u'缓存读取/创建异常')
    finally:
        self.running = False
def process(self):
    """One crawl round (pickle-backed variant): scan ids and record found menus.

    State is loaded from / saved to record.pkl via pickle. Uses the Python 2
    built-ins `file()` and blocking `urllib.urlopen`. Clears self.running
    when the round ends (but not if an unexpected exception escapes the loop
    — NOTE(review): consider a finally here).
    """
    def save():
        # persist current state to record.pkl; failures are logged, not raised
        try:
            fs = open('record.pkl', 'wb')
            pickle.dump(self.startId, fs, 0)
            pickle.dump(self.lastQuery, fs, 0)
            pickle.dump(self.cache, fs, 0)
            pickle.dump(self.maybe, fs, 0)
            fs.close()
        except IOError:
            menulog.debug(u'保存缓存失败')
    self.count += 1
    menulog.info(u'开始第%d次查找@%d' % (self.count, self.getTime()))
    try:
        f = file('record.pkl', 'rb')
        self.startId = pickle.load(f)
        self.nowId = self.startId
        self.lastQuery = pickle.load(f)  # note: overwrites with the previously saved value
        self.cache = pickle.load(f)
        self.maybe = pickle.load(f)
        f.close()
    except (IOError, EOFError):
        # no cache file, or its contents are malformed: rebuild it
        menulog.info(u'缓存读取异常, 重建')
        save()
    self.lastQuery = self.getTime()  # overwrite again with the current time
    while self.nowId - self.startId < self.interval:
        menulog.info(u'开始查找: %d' % self.nowId)
        page = urllib.urlopen(urlhead + str(self.nowId))
        text = page.read().decode('utf-8')
        if text.find(u'今日菜单') != -1:
            try:
                year = re.findall(pattern_year, text)[0]
                month = re.findall(pattern_month, text)[0]
                day = re.findall(pattern_day, text)[0]
                thisday = int(year + month + day)
                self.startId = self.nowId
                self.cache[thisday] = self.nowId
                menulog.info('find %d' % self.nowId)
            except (IndexError, ):
                if text.find(u'风味小吃') != -1:
                    # picked up the Guangzhou menu: ignore it
                    pass
                else:
                    # date patterns didn't match: remember the id as a maybe-menu
                    if self.nowId not in self.maybe:
                        self.maybe.append(self.nowId)
                        menulog.debug('IndexError')
        else:
            if text.find(u'请求素材不存在') == -1:
                # result page has content but is not a menu
                self.usedId = self.nowId
        self.nowId += 1
    if self.maybe and max(self.maybe) > max(self.cache.values()):
        # e.g. 15956 was published first with broken styling, usedId advanced to
        # 16xxx, and then 15958 was replaced with the corrected menu
        menulog.info(u'更新起点至可能的ID:%d' % max(self.maybe))
        self.startId = max(self.maybe)
    elif self.usedId > self.startId:
        menulog.info(u'更新起点至%d' % self.usedId)
        self.startId = self.usedId
    menulog.info(u'第%d次查找结束' % self.count)
    save()
    self.running = False
def process(self):
    """One crawl round (pickle-backed variant, duplicate formatting).

    Loads state from record.pkl, scans ids from startId for self.interval
    steps, records menus into self.cache, then saves state back and clears
    self.running. Python 2 only (`file()`, `urllib.urlopen`).
    """
    def save():
        # write current state back to record.pkl; swallow I/O errors with a log line
        try:
            fs = open('record.pkl', 'wb')
            pickle.dump(self.startId, fs, 0)
            pickle.dump(self.lastQuery, fs, 0)
            pickle.dump(self.cache, fs, 0)
            pickle.dump(self.maybe, fs, 0)
            fs.close()
        except IOError:
            menulog.debug(u'保存缓存失败')
    self.count += 1
    menulog.info(u'开始第%d次查找@%d' % (self.count, self.getTime()))
    try:
        f = file('record.pkl', 'rb')
        self.startId = pickle.load(f)
        self.nowId = self.startId
        self.lastQuery = pickle.load(f)  # note: this restores the previously saved value
        self.cache = pickle.load(f)
        self.maybe = pickle.load(f)
        f.close()
    except (IOError, EOFError):
        # cache file missing or malformed: recreate it from current state
        menulog.info(u'缓存读取异常, 重建')
        save()
    self.lastQuery = self.getTime()  # then overwrite with the current time
    while self.nowId - self.startId < self.interval:
        menulog.info(u'开始查找: %d' % self.nowId)
        page = urllib.urlopen(urlhead + str(self.nowId))
        text = page.read().decode('utf-8')
        if text.find(u'今日菜单') != -1:
            try:
                year = re.findall(pattern_year, text)[0]
                month = re.findall(pattern_month, text)[0]
                day = re.findall(pattern_day, text)[0]
                thisday = int(year + month + day)
                self.startId = self.nowId
                self.cache[thisday] = self.nowId
                menulog.info('find %d' % self.nowId)
            except (IndexError, ):
                if text.find(u'风味小吃') != -1:
                    # this is the Guangzhou menu: skip it
                    pass
                else:
                    # parsing failed: track the id as a possible menu
                    if self.nowId not in self.maybe:
                        self.maybe.append(self.nowId)
                        menulog.debug('IndexError')
        else:
            if text.find(u'请求素材不存在') == -1:
                # non-empty result page that is not a menu
                self.usedId = self.nowId
        self.nowId += 1
    if self.maybe and max(self.maybe) > max(self.cache.values()):
        # e.g. 15956 appeared first with broken styling, usedId moved past 16xxx,
        # then 15958 was later replaced by the corrected menu
        menulog.info(u'更新起点至可能的ID:%d' % max(self.maybe))
        self.startId = max(self.maybe)
    elif self.usedId > self.startId:
        menulog.info(u'更新起点至%d' % self.usedId)
        self.startId = self.usedId
    menulog.info(u'第%d次查找结束' % self.count)
    save()
    self.running = False