def start_requests(self):
    """Build the initial request, resuming after the rows already stored.

    Counts the existing ``Star`` rows. When at least one exists,
    ``self.index`` and ``self.page`` are recomputed from that count and
    ``self.page_size`` before ``self.next_url()`` is asked for the URL.

    Returns:
        A one-element list containing the ``scrapy.Request`` for the URL
        returned by ``self.next_url()``.
    """
    stored_rows = session.query(Star).count()
    if stored_rows > 0:
        per_page = self.page_size
        # Remainder within the current page, plus one.
        self.index = stored_rows % per_page + 1
        # Number of completely filled pages, plus one.
        self.page = math.floor(stored_rows / per_page) + 1
    first_url = self.next_url()
    return [scrapy.Request(first_url)]
def init_start_urls(self):
    """Queue requests for the sign/date pairs missing from ``astro_day``.

    Selects every date that has fewer than 12 ``astro_day`` rows, works out
    which sign ids in 1..12 have no row for that date, and appends the URL
    built by ``self.next_url()`` for each missing pair to
    ``self.start_urls``.

    Side effects:
        Overwrites ``self.astroid`` and ``self.date`` for every missing pair
        and extends ``self.start_urls``.

    Yields:
        A ``scrapy.Request`` for every entry of ``self.start_urls`` that is
        not equal to the list's first entry.
    """
    sql = """ SELECT `date`, COUNT(*) c FROM astro_day GROUP BY `date` HAVING c < 12 ORDER BY c DESC """
    # Every sign id that should exist for a date.
    all_astroids = set(range(1, 13))
    rows = engine.execute(sql).fetchall()
    for day, _count in rows:
        date = day.strftime('%Y-%m-%d')
        # Sign ids that already have a row for this date.
        found = session.query(AstroDay.astroid).filter_by(date=date).all()
        present = {row[0] for row in found}
        # The set difference gives the missing ids; sorted so the URLs are
        # queued in a deterministic order.
        for astroid in sorted(all_astroids - present):
            # next_url() reads self.astroid / self.date. The id is stored
            # minus one -- presumably next_url() advances it before use;
            # TODO confirm against next_url().
            self.astroid = astroid - 1
            self.date = date
            self.start_urls.append(self.next_url())

    for url in self.start_urls:
        # Skip the first entry and anything equal to it. This is the same
        # test as ``self.start_urls.index(url) == 0`` without rescanning the
        # list for every URL.
        if url == self.start_urls[0]:
            continue
        yield scrapy.Request(url)
def weather_item(self, item):
    """Store the forecast entries of one weather item.

    ``item['date']`` is converted to a string and its first eight characters
    are read as ``YYYYMMDD``. Each element of ``item['forecast']`` is stored
    for that date shifted by its position in the list (via
    ``date_operate``). A row that already exists for the same city and date
    is updated in place; otherwise a new ``Weather`` row is added.

    Args:
        item: Mapping with the keys ``date``, ``city``, ``shidu``, ``pm25``,
            ``pm10``, ``quality``, ``wendu`` and ``forecast``. Every forecast
            entry provides ``sunrise``, ``sunset``, ``high``, ``low``,
            ``aqi``, ``fx``, ``fl``, ``type`` and ``notice``.

    Side effects:
        Rewrites ``item['date']`` as a string and commits the session after
        each forecast entry.
    """
    item['date'] = str(item['date'])
    raw_date = item['date']
    date = '-'.join([raw_date[0:4], raw_date[4:6], raw_date[6:8]])

    # Raw string, and no '|' inside the class: in a character class '|' is a
    # literal pipe, not alternation, so the old pattern let pipes through.
    # NOTE(review): this also strips a leading '-', so a negative temperature
    # loses its sign -- confirm whether that is intended.
    non_numeric = re.compile(r'[^.\d]')

    for offset, forecast in enumerate(item['forecast']):
        curr_date = date_operate(date, offset)
        info = {
            'city': item['city'],
            'date': curr_date,
            'shidu': item['shidu'],
            'pm25': item['pm25'],
            'pm10': item['pm10'],
            'quality': item['quality'],
            'wendu': item['wendu'],
            'sunrise': forecast['sunrise'],
            'sunset': forecast['sunset'],
            # Keep only digits and the decimal point.
            'high': non_numeric.sub('', forecast['high']),
            'low': non_numeric.sub('', forecast['low']),
            'aqi': forecast['aqi'],
            'fx': forecast['fx'],
            'fl': forecast['fl'],
            'stype': forecast['type'],
            'notice': forecast['notice'],
        }
        if not info['pm25']:
            # Without a PM2.5 reading the air-quality fields are left out, so
            # an update leaves those columns untouched.
            for key in ('pm25', 'pm10', 'aqi', 'quality'):
                del info[key]

        # Update the row for this city/date if there is one, else insert.
        old_weather = session.query(Weather).filter_by(
            city=item['city']).filter_by(date=curr_date).first()
        if old_weather:
            session.query(Weather).filter_by(
                id=old_weather.id).update(info)
        else:
            session.add(Weather(**info))
        session.commit()
def start_requests(self):
    """Resume crawling from the newest horoscope rows in the database.

    Reads the newest ``AstroDay``, ``AstroMonth`` and ``AstroYear`` rows
    (ordered by date, then sign id, both descending) and copies their
    position into ``self.date`` / ``self.astroid``, ``self.month`` and
    ``self.year``. Then decrements ``self.astroid`` and asks
    ``self.next_url()`` for the first URL.

    Returns:
        A one-element list with the ``scrapy.Request`` for that URL, or an
        empty list (after logging an error) when ``next_url()`` returns a
        falsy value.
    """
    # Seeding from missing rows is currently disabled:
    # self.init_start_urls()

    # Newest daily row.
    latest_day = (
        session.query(AstroDay)
        .order_by(AstroDay.date.desc())
        .order_by(AstroDay.astroid.desc())
        .first()
    )
    if latest_day:
        self.date = latest_day.date.strftime("%Y-%m-%d")
        if latest_day.astroid != 12:
            # Same date, continue with the following sign id.
            self.astroid = latest_day.astroid + 1
        else:
            # Sign 12 is stored for that date: move the date on by one.
            self.date = date_operate(self.date, 1)

    # Newest weekly row -- disabled:
    # lastWeekAstro = session.query(AstroWeek).order_by(
    #     AstroWeek.weekth.desc()).order_by(AstroWeek.astroid.desc()).first()
    # if lastWeekAstro:
    #     self.weekth['date'] = lastWeekAstro.weekth
    #     self.weekth['astroid'] = lastWeekAstro.astroid

    # Newest monthly row.
    latest_month = (
        session.query(AstroMonth)
        .order_by(AstroMonth.date.desc())
        .order_by(AstroMonth.astroid.desc())
        .first()
    )
    if latest_month:
        self.month['date'] = latest_month.date
        self.month['astroid'] = latest_month.astroid

    # Newest yearly row.
    latest_year = (
        session.query(AstroYear)
        .order_by(AstroYear.date.desc())
        .order_by(AstroYear.astroid.desc())
        .first()
    )
    if latest_year:
        self.year['date'] = latest_year.date
        self.year['astroid'] = latest_year.astroid

    # Step back by one before building the URL -- presumably next_url()
    # advances self.astroid itself; TODO confirm against next_url().
    self.astroid -= 1
    start_url = self.next_url()
    if not start_url:
        self.logger.error('不存在可用的appkey')
        return []
    return [scrapy.Request(start_url)]
def weather_data_check(provinces):
    """Check today's weather data and return the city index to crawl from.

    Looks up the ``Weather`` row with the highest id among those dated
    today. If the newest row stored for that row's city is dated
    ``date_operate(today, 4)``, the city counts as finished and crawling
    resumes with the entry after it in ``provinces``.

    Args:
        provinces: Sequence of city names; must support ``.index()``.

    Returns:
        ``provinces.index(city) + 1`` when the last city stored today is
        complete, otherwise ``0``.

    Raises:
        Whatever ``provinces.index`` raises when the stored city is not in
        ``provinces`` (``ValueError`` for a list).
    """
    from mkspider.lib.db import session
    from mkspider.lib.models import Weather

    # strftime() without a time tuple formats the current local time.
    curr_date = time.strftime('%Y-%m-%d')

    # Most recently inserted row among those dated today.
    date_weather = session.query(Weather).filter_by(date=curr_date).order_by(
        Weather.id.desc()).first()
    if not date_weather:
        return 0

    city = date_weather.city
    if not isinstance(city, str):
        # Non-str values are decoded as UTF-8 -- presumably the driver can
        # hand back bytes; TODO confirm.
        city = str(city.decode('utf-8'))

    # Newest row stored for that city, by date.
    city_last_weather = session.query(Weather).filter_by(
        city=city).order_by(Weather.date.desc()).first()
    # A newest row dated today + 4 is treated as "this city is complete".
    if city_last_weather and str(city_last_weather.date) == date_operate(
            curr_date, 4):
        return provinces.index(city) + 1
    return 0
def start_requests(self):
    """Build the initial request, starting after the newest stored date.

    Fetches the ``Lunar`` row with the greatest year, then month, then day.
    When one exists, ``self.date`` is set to that date shifted by one via
    ``date_operate``.

    Returns:
        A one-element list containing the ``scrapy.Request`` for the URL
        returned by ``self.next_url()``.
    """
    # Newest date already in the database.
    newest = (
        session.query(Lunar)
        .order_by(Lunar.year.desc())
        .order_by(Lunar.month.desc())
        .order_by(Lunar.day.desc())
        .first()
    )
    if newest:
        last_date = "{}-{}-{}".format(newest.year, newest.month, newest.day)
        self.date = date_operate(last_date, 1)
    return [scrapy.Request(self.next_url())]
def astro_item(self, item):
    """Store the yearly, monthly and daily horoscope parts of ``item``.

    Each part is inserted only when no row exists yet for the same
    ``astroid`` and date. The weekly part is switched off.

    Args:
        item: Mapping with ``astroid``, ``astroname`` and the sub-mappings
            ``year``, ``month``, ``week`` and ``today``. A falsy sub-mapping
            is skipped.

    Side effects:
        Adds ``name`` / ``astroid`` to the sub-mappings it stores, rewrites
        ``item['month']['date']`` as ``YYYYMM``, replaces a non-numeric
        ``item['today']['number']`` with ``0``, and commits the session
        after each insert. A failed daily insert is rolled back and logged
        rather than raised.
    """
    import logging

    if item['year']:
        yearly = item['year']
        exists = session.query(AstroYear).filter_by(
            astroid=item['astroid']).filter_by(
                date=yearly['date']).first()
        if not exists:
            yearly['name'] = item['astroname']
            yearly['astroid'] = item['astroid']
            session.add(AstroYear(**yearly))
            session.commit()

    if item['month']:
        monthly = item['month']
        # Normalise "YYYY-M" to "YYYYMM" before the lookup.
        year, month = monthly['date'].split('-')
        monthly['date'] = "{}{}".format(year, str(month).zfill(2))
        exists = session.query(AstroMonth).filter_by(
            astroid=item['astroid']).filter_by(
                date=monthly['date']).first()
        if not exists:
            monthly['name'] = item['astroname']
            monthly['astroid'] = item['astroid']
            session.add(AstroMonth(**monthly))
            session.commit()

    # Weekly storage is disabled by the ``and False``; the body is kept for
    # when it is switched back on.
    if item['week'] and False:
        item['week']['name'] = item['astroname']
        item['week']['astroid'] = item['astroid']
        item['week']['weekth'] = get_weekth_by_date(item['today']['date'])
        start_date, end_date = item['week']['date'].split('~')
        end_date = "{}-{}".format(start_date.split('-')[0], end_date)
        item['week']['start_date'] = start_date
        item['week']['end_date'] = end_date
        del item['week']['date']
        astroWeek = AstroWeek(**item['week'])
        session.add(astroWeek)
        session.commit()

    if item['today']:
        daily = item['today']
        exists = session.query(AstroDay).filter_by(
            astroid=item['astroid']).filter_by(
                date=daily['date']).first()
        daily['name'] = item['astroname']
        daily['astroid'] = item['astroid']
        try:
            # ``number`` must be numeric; anything else is stored as 0.
            if not str(daily['number']).isdigit():
                daily['number'] = 0
            # Same existence check as the yearly and monthly parts.
            if not exists:
                session.add(AstroDay(**daily))
                session.commit()
        except Exception:
            # Still best-effort, but roll back: after a failed flush/commit
            # the session refuses further work until rollback() is called.
            session.rollback()
            logging.getLogger(__name__).exception(
                "failed to store daily horoscope astroid=%s date=%s",
                item['astroid'], daily['date'])
# -*- coding: utf-8 -*-
"""
Extract the solar-term entry from each calendar row's solar-term data and
store it in the row's solar-term field (``jieqi`` -> ``jieqi2``).
"""
import json
import os
import sys

# Make the project root (the parent of this script's directory) importable
# before the project imports below.
bin_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.join(bin_dir, '..')
sys.path.append(root_dir)

from mkspider.lib.db import session
from mkspider.lib.models import Lunar
from mkspider.lib.common import slog

lanurs = session.query(Lunar).order_by(Lunar.id.asc()).all()
for row in lanurs:
    # ``jieqi`` holds JSON that is looked up by the day number as a string.
    terms_by_day = json.loads(row.jieqi)
    day_key = str(row.day)
    # Skip rows whose day has no entry, or whose ``jieqi2`` is already set.
    if day_key not in terms_by_day or row.jieqi2:
        continue
    term = terms_by_day[day_key]
    slog("DD", "[%s-%s-%s]节气:%s" % (row.year, row.month, row.day, term))
    session.query(Lunar).filter_by(id=row.id).update({'jieqi2': term})
    session.commit()
def init_types(self):
    """Initialise the weather-type data in ``self.types``.

    Queries the ``stype`` and ``notice`` columns of ``Weather`` grouped by
    ``stype`` and stores each pair as ``self.types[stype] = notice``.
    """
    rows = session.query(Weather.stype, Weather.notice).group_by('stype').all()
    for stype, notice in rows:
        self.types[stype] = notice