async def scrape(self) -> [Plan]:
    self.session = Session()
    start = 1
    end = self.getLast() + 1
    plans = await asyncio.gather(*[self.job(x) for x in range(start, end)])
    return list(filter(None, plans))
class AuthHandler(BaseHandler):
    def __init__(self, application, request, **kwargs):
        super(AuthHandler, self).__init__(application, request, **kwargs)
        self.cache = self.settings['cache']
        self.session = Session(self.settings['session_manager'], self)
        self.save_userid()

    def get_current_user(self):
        return self.session.get('userid')

    def save_userid(self, userid=None):
        if userid:
            self.userid = int(userid)
            self.session['userid'] = self.userid
            self.session.save()
        else:
            self.userid = self.session.get('userid') or 0
        if options.debug:
            self.set_cookie("uid", str(self.userid), expires_days=30, httponly=True)
        else:
            self.set_cookie("uid", str(self.userid), expires_days=30, httponly=True, secure=True)
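# A minimal sketch, not from the original project, of how the settings that
# AuthHandler reads above (self.settings['cache'] and
# self.settings['session_manager']) might be wired into the Tornado
# application. `SessionManager` and the redis connection details are
# assumptions.
import redis
import tornado.web

cache = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)        # assumed cache store
session_manager = SessionManager(secret="change-me", store=cache)   # hypothetical constructor

application = tornado.web.Application(
    [(r"/auth", AuthHandler)],
    cache=cache,
    session_manager=session_manager,
    cookie_secret="change-me",
)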
class LoginHandler(BaseHandler):
    def post(self, *args, **kwargs):
        userName = self.json_args.get("username")
        passWord = self.json_args.get("password") + config.sha_salt
        sha256 = hashlib.sha256()
        sha256.update(passWord.encode(encoding='utf-8'))
        passWordEncryption = sha256.hexdigest().lower()
        userInfos = self.redis.get(userName)
        login = False
        identity = 3
        if userInfos:  # the user is cached in redis
            userData = eval(self.redis.get(userName))
            if userData.get('ui_salt') == passWordEncryption:  # compare the cached password hash with the submitted one
                login = True
                user_name = userData.get('ui_name')
                identity = userData.get('ui_identity')
                try:
                    self.session = Session(self)
                    self.session.data['id'] = userData.get('ui_id')
                    self.session.data['name'] = userData.get('ui_name')
                    self.session.data['identity'] = userData.get('ui_identity')
                    self.session.save()
                except Exception as e:
                    logging.error(e)
        else:  # not in redis, fall back to mysql
            userData = self.db.query_one(whereFieldValue="ui_number = '" + userName + "'",
                                         table='tab_user_infos')
            if userData:
                if userData[0]['ui_salt'] == passWordEncryption:  # login succeeded
                    print('登陆成功')  # "login succeeded"
                    login = True
                    identity = userData[0].get('ui_identity')
                    user_name = userData[0].get('ui_name')
                    try:
                        self.session = Session(self)
                        self.session.data['id'] = userData[0].get('ui_id')
                        self.session.data['name'] = userData[0].get('ui_name')
                        self.session.data['identity'] = userData[0].get('ui_identity')
                        self.session.save()
                    except Exception as e:
                        logging.error(e)
                else:
                    login = False
                result = str(userData[0])
                self.redis.set(userName, result)
            else:  # wrong account or password
                login = False
        if login:
            content = {'msg': '登陆成功', 'name': user_name, 'identity': identity}
            retData = self._returnData(True, content)
        else:
            retData = self._returnData(False, 'error_login')
        self.write(retData)
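# A small illustrative helper, assuming the hashing convention the handler
# above relies on: the stored `ui_salt` field is sha256(password + config.sha_salt)
# in lowercase hex. `hash_password` is not part of the original file;
# `config.sha_salt` is the project's own constant.
import hashlib

def hash_password(password, salt):
    digest = hashlib.sha256()
    digest.update((password + salt).encode("utf-8"))
    return digest.hexdigest().lower()

# e.g. when creating a user record:
# user_record["ui_salt"] = hash_password(plain_password, config.sha_salt)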
def start_scrapy(self):
    session = Session()
    res = session.request("GET", self.url)
    if res is None:
        return
    html = res.content
    if self.end is None:
        self.end = get_last_page(html)
    for page in range(self.start, self.end):
        new_url = self.url + '?page=' + str(page)
        res = session.request("GET", new_url)
        if res is None:
            return
        html = res.content
        logger.debug('page: ' + str(page) + " res.status:" + str(res))
        for book in BeautifulSoup(html, 'html.parser').select('div.gallery'):
            d = self.get_book_info(book)
            Book(d['info_page_url'], log_option={'result_page': page}).download()
def __init__(self, info_page_url, log_option=dict):
    self.info_page_url = info_page_url
    res = Session().request("GET", self.info_page_url)
    if res is None:
        return
    self.soup = BeautifulSoup(res.content, 'html.parser')
    self.title = None
    self.log_option = log_option
    self.sub_file_name = None
    self.gid = None
    self.token = None
    self.tumb_url = None
    self.max_page = 0
    self.download_path = None
    self.init_from_net()
async def fetch(page):
    url = '{}/galleries/{}/{}.{}'.format(GALLERY_PATH, self.gid, str(page), self.sub_file_name)
    title = get_pic_name(url)
    if downloaded_img(title):
        return
    res = await asyncio.get_event_loop().run_in_executor(
        None,
        functools.partial(Session().request, method="GET", url=url, title=self.title))
    if res is None:
        pass
    else:
        open(os.path.join(self.download_path, title), 'wb').write(res.content)
        logger.info("got:{},({})".format(url, self.max_page))
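# A hedged sketch (not from the original file) of how coroutines like fetch()
# above are typically fanned out: because Session().request blocks, each call
# is pushed onto the default thread-pool executor, so awaiting many fetch()
# coroutines with gather() downloads pages concurrently. `fetch_range` and
# `max_page` (standing in for self.max_page) are assumptions.
async def fetch_range(max_page):
    await asyncio.gather(*[fetch(page) for page in range(1, max_page + 1)])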
class BaseHandler(tornado.web.RequestHandler):
    def __init__(self, *argc, **argkw):
        super(BaseHandler, self).__init__(*argc, **argkw)
        self.session = Session(self.application.session_manager, self)
        # Reuse the redis connection for session manager as a cache store
        self.cache_server = self.application.session_manager.redis
        self.conn = None
        self.conn_slave = None
        self.current_user_profile = None
        self.app_id = self.request.host.split(':')[0].lower()
        # if self.app_id not in DB_POOL:
        self.app_id = app_settings['app_id']
        # logging.info('APP:%s' % self.app_id)
        # if self.app_id and self.app_id in DB_POOL:
        self.db_pool = ConnQueue(settings['apps'][0])  # DB_POOL[self.app_id]
        # else:
        #     self.db_pool = None
        # logging.info(self.db_pool)
        self.config = app_settings

    def is_wechat(self):
        return self.request.headers['User-Agent'].find('MicroMessenger') > 0

    def is_auth(self):
        return self.current_user_profile

    def is_admin(self):
        return self.current_user_profile

    def is_member(self):
        return self.current_user_profile and self.current_user_profile['member']

    # compute the signature
    def createSign(self, paramMap, secret):
        codec = ''
        for key in sorted(paramMap.iterkeys()):
            codec += ('%s=%s&' % (key, paramMap[key]))
        codec += "key=%s" % secret
        sign = (hashlib.md5(codec).hexdigest()).upper()
        return sign

    def arrayToXml(self, arr):
        """Convert a dict to xml."""
        xml = ["<xml>"]
        for k, v in arr.iteritems():
            if v.isdigit():
                xml.append("<{0}>{1}</{0}>".format(k, v))
            else:
                xml.append("<{0}><![CDATA[{1}]]></{0}>".format(k, v))
        xml.append("</xml>")
        return "".join(xml)

    def xmlToArray(self, xmldata):
        """Convert xml to a dict."""
        array_data = {}
        root = xml.etree.ElementTree.fromstring(xmldata)
        for child in root:
            value = child.text
            array_data[child.tag] = value
        return array_data

    @tornado.gen.coroutine
    def getsignkey(self, m_id, uuid, m_key):
        sandbox = "https://api.mch.weixin.qq.com/sandboxnew/pay/getsignkey"
        prepare = {"mch_id": m_id, "nonce_str": uuid}
        prepare["sign"] = self.createSign(prepare, m_key)
        xmlbody = self.arrayToXml(prepare)
        # logging.info(xmlbody)
        client = tornado.httpclient.AsyncHTTPClient()
        request = tornado.httpclient.HTTPRequest(url=sandbox,
                                                 method="POST",
                                                 validate_cert=False,
                                                 headers='',
                                                 body=self.arrayToXml(prepare))
        resp = yield client.fetch(request)
        # logging.info(resp.body)
        raise tornado.web.gen.Return(self.xmlToArray(resp.body))

    # merge person using cellphone
    @tornado.gen.coroutine
    def merge_person(self, cellphone=None, auth_id=None, email=None, corp_open_id=None):
        if cellphone and auth_id:
            yield self.merge_person_cellphone(cellphone, auth_id)
        if email and auth_id:
            yield self.merge_person_email(email, auth_id)
        if cellphone and corp_open_id:
            auth_res = yield self.fetchone_db(
                "select * from t_person where corp_open_id=%s", corp_open_id)
            person = yield self.fetchone_db(
                "select person_id from t_person where cellphone=%s and person_id!=%s",
                cellphone, auth_res['person_id'])
            if auth_res and person:
                yield self.update_db(
                    """update t_person set corp_open_id = %s, user_id = %s, fullname = %s,
                    auth_date = %s, gender = %s, cellphone = %s, email = %s, wechatid = %s,
                    position = %s, avatar = %s where person_id=%s""",
                    corp_open_id, auth_res['user_id'], auth_res['fullname'], get_now_str(),
                    auth_res['gender'], auth_res['cellphone'], auth_res['email'],
                    auth_res['wechatid'], auth_res['position'], auth_res['avatar'],
                    person['person_id'])
                yield self.execute_db(
                    """delete from t_person where person_id = %s """, auth_res['person_id'])
        if email and corp_open_id:
            auth_res = yield self.fetchone_db(
                "select * from t_person where corp_open_id=%s", corp_open_id)
            person = yield self.fetchone_db(
                "select person_id from t_person where email=%s and person_id!=%s",
                email, auth_res['person_id'])
            if auth_res and person:
                if auth_res['cellphone'] != person['cellphone']:
                    cellphone = auth_res['cellphone']
                    cellphone1 = person['cellphone']
                else:
                    cellphone = auth_res['cellphone']
                    cellphone1 = ''
                yield self.update_db(
                    """update t_person set corp_open_id = %s, user_id = %s, fullname = %s,
                    auth_date = %s, gender = %s, cellphone = %s, email = %s, wechatid = %s,
                    position = %s, avatar = %s where person_id=%s""",
                    corp_open_id, auth_res['user_id'], auth_res['fullname'], get_now_str(),
                    auth_res['gender'], auth_res['cellphone'], auth_res['email'],
                    auth_res['wechatid'], auth_res['position'], auth_res['avatar'],
                    person['person_id'])
                yield self.execute_db(
                    """delete from t_person where person_id = %s """, auth_res['person_id'])

    @tornado.gen.coroutine
    def merge_person_cellphone(self, cellphone, auth_id):
        if cellphone and auth_id:
            auth_res = yield self.fetchone_db(
                "select * from t_person where auth_id=%s", auth_id)
            person = yield self.fetchone_db(
                "select person_id from t_person where cellphone=%s and person_id<>%s",
                cellphone, auth_res['person_id'])
            if auth_res and person:
                yield self.update_db(
                    """update t_person set auth_id = %s, open_id = %s, web_open_id = %s,
                    auth_date = %s, nick_name = %s, head_img_url = %s, gender = %s,
                    city = %s, province = %s, country = %s where person_id=%s""",
                    auth_id, auth_res['open_id'], auth_res['web_open_id'], get_now_str(),
                    auth_res['nick_name'], auth_res['head_img_url'], auth_res['gender'],
                    auth_res['city'], auth_res['province'], auth_res['country'],
                    person['person_id'])
                yield self.execute_db(
                    """delete from t_person where person_id = %s """, auth_res['person_id'])
            auth_res = yield self.fetchone_db(
                "select * from t_person where auth_id=%s", auth_id)
            if auth_res and auth_res['email']:
                yield self.merge_person_email(auth_res['email'], auth_id)

    # merge person using email
    @tornado.gen.coroutine
    def merge_person_email(self, email, auth_id):
        auth_res = yield self.fetchone_db(
            "select * from t_person where auth_id=%s", auth_id)
        person = yield self.fetchone_db(
            "select * from t_person where email=%s and person_id!=%s",
            email, auth_res['person_id'])
        if auth_res and person:
            if auth_res['cellphone'] != person['cellphone']:
                cellphone = auth_res['cellphone']
                cellphone1 = person['cellphone']
            else:
                cellphone = auth_res['cellphone']
                cellphone1 = ''
            yield self.update_db(
                """update t_person set auth_id = %s, open_id = %s, web_open_id = %s,
                auth_date = %s, nick_name = %s, head_img_url = %s, gender = %s, city = %s,
                province = %s, country = %s, cellphone = %s, cellphone1 = %s
                where person_id= %s""",
                auth_id, auth_res['open_id'], auth_res['web_open_id'], get_now_str(),
                auth_res['nick_name'], auth_res['head_img_url'], auth_res['gender'],
                auth_res['city'], auth_res['province'], auth_res['country'],
                cellphone, cellphone1, person['person_id'])
            yield self.execute_db(
                """delete from t_person where person_id = %s""", auth_res['person_id'])

    @tornado.gen.coroutine
    def refresh_user_profile(self):
        user_id = self.current_user
        handler = self
        if not user_id or handler.get_cache("user-profile-refreshed-%s" % user_id):
            return
        profile = {}
        profile['authorized'] = yield handler.fetchone_db(
            "select * from t_person where auth_id=%s", user_id)
        profile['member'] = yield handler.fetchone_db(
            "select t.* , c.code_name as member_type_name from t_member as t "
            "left join t_codes as c on c.code_id = t.member_type where t.auth_id=%s",
            user_id)
        profile['person'] = yield handler.fetchone_db(
            "select a.* from t_person as a where a.auth_id=%s", user_id)
        profile['org'] = yield handler.fetchone_db(
            "select a.* from t_org as a, t_person as b, t_org_person as c "
            "where b.auth_id=%s and c.person_id = b.person_id and c.org_id = a.org_id",
            user_id)
        profile['roles'] = yield handler.query_db(
            "select a.* from t_auth_role as a where a.auth_id=%s", user_id)
        handler.set_cache("user-profile-" + user_id, dump_json(profile), 86400 * 300)
        handler.set_secure_cookie("user", user_id, expires_days=300)
        handler.set_cache("user-profile-refreshed-%s" % user_id, '1', 60)

    def escape(self, obj):
        '''Escape whatever value you pass to it'''
        if isinstance(obj, str_type):
            return "'" + escape_string(obj) + "'"
        return escape_item(obj, 'utf-8')

    def escape_string(self, s):
        return tornado.escape.xhtml_escape(escape_string(s))

    def limit_action(self, key, max_times=200):
        res = self.session.get(key) or 0
        if res and int(res) > max_times:
            return True
        self.session.set(key, int(res) + 1)
        return False

    @tornado.gen.coroutine
    def add_traffic(self, key, t):
        yield self.insert_db(
            "insert into ac_traffic (item_id, item_type, created, ip) values (%s, %s, now(), %s)",
            key, t, self.request.remote_ip)

    def set_cache_obj(self, key, value, timeout=60):
        self.set_cache(key, dump_json(value), timeout)

    def set_cache(self, key, value, timeout=60):
        if len(key) > 200:
            key = genenate_file_key(key)
        self.cache_server.setex(self.config['app_id'] + '-' + key, timeout, value)

    def get_cache_obj(self, key):
        r = self.get_cache(key)
        if r:
            return json.loads(r)
        return ''

    def get_cache(self, key):
        if len(key) > 200:
            key = genenate_file_key(key)
        return self.cache_server.get(self.config['app_id'] + '-' + key)

    @tornado.gen.coroutine
    def do_task(self, func, queue, args):
        key = 'task-working-' + genenate_file_key(dump_json(args))
        if self.get_cache(key):
            raise tornado.web.gen.Return(None)
            return
        res = None
        try:
            self.set_cache(key, args, 60)
            res = yield tornado.gen.Task(func, queue=queue, args=args)
            self.set_cache(key, '', 1)
        except:
            self.set_cache(key, '', 1)
            raise tornado.web.gen.Return(None)
            return
        raise tornado.web.gen.Return(res)

    def check_login(self):
        if not self.current_user:
            self.redirect("/login")
            # self.finish()
            return

    @tornado.gen.coroutine
    def redirect_unauth(self):
        if isinstance(self, BasePage):
            self.close_db_conn()
            self.redirect("/page/unauth")
        else:
            self.close_db_conn()
            yield self.response_error('access_not_allow')

    def get_current_user(self):
        user = self.get_secure_cookie("user")
        return user

    @tornado.gen.coroutine
    def refresh_app_config(self):
        k = self.app_id + '-app-config-refreshed'
        if not self.get_cache(k):
            res = yield self.query_db("select * from t_settings")
            settings = {}
            for r in res:
                settings[r['st_id']] = r["st_value"]
            self.config = settings
            self.set_cache(k, 1, 60)
            logging.info("refresh app config of %s" % self.app_id)
            # refresh the user profile
            # yield self.refresh_user_profile()

    @tornado.gen.coroutine
    def get_db_conn(self, master=False):
        if self.conn and self.conn.stream and not self.conn.stream.closed():
            raise tornado.web.gen.Return(self.conn)
            return
        if not self.app_id or not self.db_pool:
            yield self.error('网络错误,应用不存在')  # "network error, the application does not exist"
            raise tornado.web.gen.Return(None)
            return
        if not self.db_pool.is_init:
            yield self.db_pool.init_conn()
        conn = self.db_pool.get_item()
        if not conn:
            yield self.error('网络繁忙,请稍候重试')  # "network busy, please retry later"
            raise tornado.web.gen.Return(None)
            return
        self.conn = conn
        if self.conn.stream.closed():
            yield self.conn.connect()
        yield self.refresh_app_config()
        raise tornado.web.gen.Return(self.conn)

    @tornado.gen.coroutine
    def execute_batch_db(self, *args, **kwargs):
        conn = yield self.get_db_conn(True)
        try:
            res = yield conn.executebatch(*args, **kwargs)
            raise tornado.gen.Return(res)
        except:
            logging.error("Error to executebatch %s" % str(sys.exc_info()))
            raise tornado.gen.Return(False)

    @tornado.gen.coroutine
    def callproc_db(self, *args, **kwargs):
        conn = yield self.get_db_conn()
        res = yield conn.query(*args, **kwargs)
        ### fix issue
        yield conn.next_result()
        raise tornado.gen.Return(res or [])

    @tornado.gen.coroutine
    def query_db(self, *args, **kwargs):
        conn = yield self.get_db_conn()
        res = yield conn.query(*args, **kwargs)
        # raise tornado.web.HTTPError(500, log_message="Query Failed")
        raise tornado.gen.Return(res or [])

    @tornado.gen.coroutine
    def fetchone_db(self, *args, **kwargs):
        args = list(args)
        args[0] = args[0] + ' LIMIT 0,1'
        res = yield self.query_db(*args, **kwargs)
        if res and len(res) > 0:
            raise tornado.gen.Return(res[0])
        raise tornado.gen.Return(None)

    @tornado.gen.coroutine
    def update_db(self, *args, **kwargs):
        conn = yield self.get_db_conn(True)
        res = yield conn.update(*args, **kwargs)
        raise tornado.gen.Return(res)

    @tornado.gen.coroutine
    def insert_db_by_obj(self, table, obj):
        keys = []
        vals = []
        for k in obj:
            keys.append(k)
            vals.append(obj[k])
        sql = '''insert into %s (%s) values (%s)''' % (table, ','.join(keys),
                                                       ('%s,' * len(keys))[:-1])
        res = yield self.insert_db(sql, *vals)
        raise tornado.gen.Return(res)

    @tornado.gen.coroutine
    def update_db_by_obj(self, table, obj, condition):
        keys = []
        vals = []
        sets = []
        for k in obj:
            keys.append(k)
            vals.append(obj[k])
            sets.append(str(k) + '=%s')
        sql = '''update %s set %s''' % (table, ','.join(sets))
        res = yield self.update_db(sql + " where " + condition, *vals)
        raise tornado.gen.Return(res)

    @tornado.gen.coroutine
    def insert_db(self, *args, **kwargs):
        conn = yield self.get_db_conn(True)
        res = yield conn.insert(*args, **kwargs)
        raise tornado.gen.Return(True)

    @tornado.gen.coroutine
    def insert_db_and_return_last_insert_id(self, *args, **kwargs):
        conn = yield self.get_db_conn(True)
        args_list = list(args)
        args_list[0] = "%s; SELECT LAST_INSERT_ID()" % args[0]
        n_args = tuple(args_list)
        res = yield conn.insert(*n_args, **kwargs)
        raise tornado.gen.Return(True)

    @tornado.gen.coroutine
    def execute_db(self, *args, **kwargs):
        conn = yield self.get_db_conn(True)
        res = yield conn.execute(*args, **kwargs)
        raise tornado.gen.Return(res)

    @tornado.gen.coroutine
    def execute_rowcount_db(self, *args, **kwargs):
        conn = yield self.get_db_conn()
        res = yield conn.execute_rowcount(*args, **kwargs)
        raise tornado.gen.Return(res)

    def get_safe_text_argument(self, key, default=None):
        return self.get_escaped_argument(key)

    def get_escaped_argument(self, key, default=None):
        arg = self.get_argument(key, default)
        if type(arg) == type(''):
            arg = self.escape_string(arg)
        return arg

    def get_args(self, keys):
        res = {}
        if type(keys) == type([]):
            for k in keys:
                res[k] = self.get_escaped_argument(k)
        else:
            for k in keys:
                res[k] = self.get_escaped_argument(k) if keys[k] == '*' \
                    else self.get_escaped_argument(k, keys[k])
                if keys[k] == '*' and not res[k]:
                    return None
        return res

    @tornado.gen.coroutine
    def response_as_json(self, res):
        self.set_header("Content-Type", 'text/html; charset="utf-8"')
        self.write(dump_json(res))
        self.finish()

    @tornado.gen.coroutine
    def restful(self, res):
        yield self.response_as_json(res)

    @tornado.gen.coroutine
    def success(self, txt):
        yield self.restful({'message': txt})

    @tornado.gen.coroutine
    def response_error(self, error_name, status_code=0):
        """
        write error message
        :param error_name:
        :param status_code
        """
        if status_code == 0:
            self.set_header("Content-Type", 'text/html; charset="utf-8"')
            e = ERROR_CODE[error_name] if error_name in ERROR_CODE else {
                'error': error_name,
                'error_code': '911'
            }
            res = {
                'error': e['error'],
                'error_code': e['error_code'],
                'error_name': error_name
            }
            self.write(json.dumps(res))
            # self.finish()
            raise tornado.web.Finish()
            if settings['debug']:
                logging.info('response_error%s' % res)
        else:
            raise tornado.web.HTTPError(status_code=status_code, log_message=error_name)
            if settings['debug']:
                logging.info('response_error', status_code, error_name)

    @tornado.gen.coroutine
    def error(self, error_name, status_code=0):
        yield self.response_error(error_name, status_code)

    # @tornado.gen.coroutine
    def close_db_conn(self):
        # release the master connection
        if self.db_pool and self.conn:
            self.db_pool.put_item(self.conn)

    @tornado.gen.coroutine
    def get_current_user_by_order(self, order_id):
        if order_id:
            order = yield self.fetchone_db(
                "select user_id from order_info where order_id = %s", order_id)
            if order:
                user_id = order.user_id
                raise tornado.gen.Return(user_id)
        raise tornado.gen.Return('')

    @property
    def current_user(self):
        """
        get current administrator
        :return: manager_code
        """
        return self.get_current_user()

    @property
    def current_mgr(self):
        """
        get current administrator
        :return: manager_code
        """
        return self.get_secure_cookie("mgr")

    def on_finish(self):
        self.close_db_conn()

    def error_process(self, message):
        response = json.loads(message)
        error_code = response.get('errcode')
        if error_code and error_code == 40001:
            wechat_sign.Sign(self.config['wechat_appid'], self.config['wechat_secret'],
                             self.config['site_host_url'], self).clearCache()
        return error_code

    def get_person_args(self):
        person_args = self.get_args({
            'fullname': '',
            'gender': '',
            'birthday': '',
            'school': '',
            'school_start': '',
            'education': '',
            'wechatid': '',
            'cellphone': '*',
            'cellphone1': '',
            'cellphone2': '',
            'email': '',
            'email1': '',
            'email2': '',
            'address': '',
            'school_number': '',
            'school_department': '',
            'person_info': '',
            'expects': '',
            'wills': ''
        })
        return person_args

    @coroutine
    def member_type_list(self, advanced=False, normal=False, org=False,
                         person=False, code_name=False):
        '''
        Return all member types matching the given filters; either code_id or
        code_name can be returned, code_id by default.
        :param advanced: board member
        :param normal: regular member
        :param org: corporate member
        :param person: individual member
        :param code_name: return the member type's display name
        :return:
        '''
        member_types = list()
        if advanced:
            member_types = yield self.query_db(
                "select code_id,code_name from t_codes where code_id like '%%advanced%%' and code_type = 'member_type'")
        elif normal:
            member_types = yield self.query_db(
                "select code_id,code_name from t_codes where code_id not like '%%advanced%%' and code_type = 'member_type'")
        elif org:
            member_types = yield self.query_db(
                "select code_id,code_name from t_codes where code_id like '%%org%%' and code_type = 'member_type'")
        elif person:
            member_types = yield self.query_db(
                "select code_id,code_name from t_codes where code_id not like '%%org%%' and code_type = 'member_type'")
        else:
            member_types = yield self.query_db(
                "select code_id,code_name from t_codes where code_type = 'member_type'")
        if code_name:
            raise tornado.gen.Return([i.code_name for i in member_types])
        else:
            raise tornado.gen.Return([i.code_id for i in member_types])
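# A hedged example handler (not part of the original file) showing how the
# coroutine-style helpers on BaseHandler above are meant to be combined:
# query_db() yields rows, error() goes through response_error(), and
# response_as_json() writes and finishes the response. The handler name and
# query are assumptions; t_person and 'access_not_allow' come from the class above.
class PersonListHandler(BaseHandler):
    @tornado.gen.coroutine
    def get(self):
        if not self.current_user:
            yield self.error('access_not_allow')
            return
        rows = yield self.query_db(
            "select person_id, fullname from t_person order by person_id desc")
        yield self.response_as_json({'persons': rows})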
# loader = self.app.jinja_loader
# if loader is not None:
#     yield loader, template
app = Flask(__name__,
            static_folder='static',
            static_path='/static',
            template_folder='template')
app.config.from_object(__name__)
app.secret_key = 'super secret key'
app.config['SESSION_TYPE'] = 'redis'
app.config['SESSION_PERMANENT'] = False
# app.config['SESSION_COOKIE_HTTPONLY'] = False
app.permanent_session_lifetime = timedelta(days=1)
Session(app)
app.jinja_options = Flask.jinja_options.copy()
# app.jinja_options['loader'] = LeafinlineLoader(app)
app.register_blueprint(running, url_prefix='/gds/api/running')
app.register_blueprint(task, url_prefix='/gds/api/task')
app.register_blueprint(script, url_prefix='/gds/api/script')
app.register_blueprint(creator, url_prefix='/gds/api/creator')
app.register_blueprint(api, url_prefix='/gdc/api')


@app.route('/', methods=['GET'])
def index():
    return render_template('index.html', appname=g.appname)
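# A hedged sketch of how the server-side session configured above is then used:
# with SESSION_TYPE = 'redis', Flask-Session keeps the payload in redis and the
# cookie only carries the session id, so flask.session can be read and written
# like a dict. The /gds/api/whoami route, the 'userid'/'visits' keys and the
# app.run() parameters are assumptions, not part of the original file.
from flask import session, jsonify

@app.route('/gds/api/whoami', methods=['GET'])
def whoami():
    session['visits'] = session.get('visits', 0) + 1   # persisted server-side by Flask-Session
    return jsonify(userid=session.get('userid'), visits=session['visits'])

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)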
class ArteScraper(Scraper):
    async def scrape(self) -> [Plan]:
        self.session = Session()
        start = 1
        end = self.getLast() + 1
        plans = await asyncio.gather(*[self.job(x) for x in range(start, end)])
        return list(filter(None, plans))

    async def job(self, id):
        url = "https://ed.arte.gov.tw/ch/content/m_design_content.aspx?AE_SNID=" + str(id)
        res = self.session.request("GET", url)
        if res is None:
            logger.warning(url + " is None")
            return None
        soup = BeautifulSoup(res.content, "html.parser")
        title = soup.select_one(
            "div.title_wrapper h3.animate.title-c1.title_icon_news").text
        if len(title) == 0:
            logger.warning(url + " is None because not find title")
            return None
        logger.info("GET data: " + url)
        p = self.parser(soup, title, url)
        return p

    def parser(self, soup, title, url):
        tags = []
        grades = []
        writers = []
        subjects = []
        for tag in soup.select(
                "div.author-date div.column.one-second.column_column span.f_c3.title_icon_chevron.m_left_10"):
            tag = tag.text.split(":")
            for content in tag[1].split("."):
                if len(content) == 0:
                    continue
                content = content.strip()
                if tag[0] == "教學設計者":
                    writers.append(content)
                elif tag[0] == "適用對象":
                    grades.append(self.audience_parser(content))
                elif tag[0] == "學習領域":
                    subjects.append(content)
                else:
                    tags.append(tag[0].strip() + ":" + content)
        content = ""
        for section in soup.select("div.entry-content div.the_content_wrapper"):
            for section2 in section.contents:
                # content += html2text(str(section2))
                content += str(section2)
        formats = set()
        for a in soup.select("div.column.one.author-box div.desc-wrapper div.desc a"):
            formats.add(
                self._get_format_from_extension(os.path.splitext(a["href"])[1]))
        return Plan(
            id=self._hash_id(url),
            origin_id=self.origin_id,
            title=title,
            writers=writers,
            tags=list(tags),
            page=url,
            grades=grades,
            subjects=subjects,
            formats=list(formats),
            # description=content
        )

    def audience_parser(self, content):
        return {
            "高中三年級": 12,
            "高中二年級": 11,
            "高中一年級": 10,
            "國中三年級": 9,
            "國中二年級": 8,
            "國中一年級": 7,
            "小學六年級": 6,
            "小學五年級": 5,
            "小學四年級": 4,
            "小學三年級": 3,
            "小學二年級": 2,
            "小學一年級": 1,
        }[content]

    def getLast(self):
        l = []
        for i in [1, 2, 3]:
            url = "https://ed.arte.gov.tw/ch/content/m_design_list_%d.aspx" % i
            soup = BeautifulSoup(
                self.session.request("GET", url).content, "html.parser")
            href = soup.select_one(
                ".post-title .entry-title-c1.title_icon_news a")["href"]
            l.append(int(href.replace("m_design_content_%d.aspx?AE_SNID=" % i, "")))
        return max(l)
class SportsboxScraper(Scraper):
    async def scrape(self) -> [Plan]:
        self.session = Session()
        start = 1
        end = self.getLast() + 1
        plans = await asyncio.gather(*[self.job(x) for x in range(start, end)])
        return list(filter(None, plans))

    async def job(self, id):
        url = "https://sportsbox.sa.gov.tw/material/detail/" + str(id)
        res = self.session.request("GET", url)
        if res is None:
            logger.warning(url + " is None")
            return None
        soup = BeautifulSoup(res.content, "html.parser")
        title = soup.select_one("div.article_titleBox div.h4")
        if title is None:
            logger.warning(url + " is None because not find title")
            return None
        logger.info("GET data: " + url)
        p = self.parser(soup, title.text, url)
        return p

    def parser(self, soup, title, url) -> Plan:
        fileInContent = True
        grades = []
        tags = []
        formats = set()
        for infoRow in soup.select(
                "div.right_dataBox.box_shadow.b_radius div.row.infoRow div.col-12"):
            # tags, grades
            for tagEle in infoRow.select("div.article_tag.rounded"):
                try:
                    grades += self.audience_parser(tagEle.text)
                except KeyError as e:
                    pass
                else:
                    tags.append("運動:" + tagEle.text)
            # formats
            # older pages put the formats inside article_box
            eles = infoRow.select(
                "div.row.no-gutters.article_box_file div.col.file_name a")
            for a in eles:
                if fileInContent:
                    fileInContent = False
                formats.add(
                    self._get_format_from_extension(os.path.splitext(a.text)[1]))
        # tags, writers
        # older pages put some of the tags inside the editBox
        writers = set()
        # https://stackoverflow.com/questions/4188933/how-do-i-select-the-innermost-element
        for p in soup.select("div.editBox p"):
            if not fileInContent:
                key = None
                for strong in p.select("strong"):
                    # id 1~223
                    if ":" in strong.text:
                        arr = strong.text.split(":")
                        key = arr[0].replace("\u3000", "")
                        if key in ["作者", "姓名"]:
                            v = arr[1].strip()
                            if v.endswith("、"):
                                writers.add(v[:-1])
                            else:
                                writers = writers.union(set(v.split("、")))
                        elif key in ["獎項", "教案名稱"]:
                            tags.append(key + ":" + arr[1])
                    elif key in ["作者", "姓名"]:
                        writers.add(strong.text.replace("、", ""))
                    # ids 223~229: the author is buried in the body text
                    # ids 231~301: file in content
                    # TODO
            # else:
            #     # https://sportsbox.sa.gov.tw/material/detail/231
            #     print("file In Content")
            #     after = p.select_one("span strong a").next_siblings
        img = ""
        content = soup.select_one("div.article_contentBox div.editBox").text
        # for section in soup.select_one("div.d-flex.justify-content-end.border-bottom.mb-3").next_siblings:
        #     # print(str(section))
        #     if type(section) is not NavigableString:
        #         imgEle = section.select_one("img")
        #         if imgEle is not None and len(img) == 0:
        #             img = imgEle['src']
        #         content += str(section)
        return Plan(
            id=self._hash_id(url),
            origin_id=self.origin_id,
            title=title,
            writers=list(writers),
            tags=list(tags),
            page=url,
            grades=grades,
            subjects=["體育"],
            formats=list(formats),
            description=content,
            img=img,
        )

    def audience_parser(self, content):
        return {
            "1-2年級": [1, 2],
            "3-4年級": [3, 4],
            "5-6年級": [5, 6],
            "7-9年級": [7, 8, 9],
            "10-12年級": [10, 11, 12],
        }[content]

    def getLast(self):
        l = []
        for id in [10, 11, 12, 13]:
            url = "https://sportsbox.sa.gov.tw/material/list/" + str(id)
            res = self.session.request("GET", url)
            if res is None:
                logger.warning(url + " is None")
                return
            soup = BeautifulSoup(res.content, "html.parser")
            href = soup.select_one(
                "div.itemBox.unit3_v.rounded.box_shadow.itemCard a")["href"]
            l.append(int(href.replace("/material/detail/", "")))
        return max(l)
def get_current_user(self):
    self.session = Session(self)
    return self.session.data
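# A hedged companion example: with get_current_user() returning the session
# data dict as above, tornado.web.authenticated can guard a handler and
# redirect to the application's login_url when the session is empty. The
# handler name and body are assumptions; the 'name' key mirrors the
# session.data['name'] set in the LoginHandler example earlier.
import tornado.web

class ProfileHandler(BaseHandler):
    @tornado.web.authenticated
    def get(self):
        self.write({'name': self.current_user.get('name')})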
class MoeScraper(Scraper):
    async def scrape(self) -> [Plan]:
        self.session = Session()
        start = 10314
        end = self.getLast() + 1
        plans = await asyncio.gather(*[self.job(x) for x in range(start, end)])
        return list(filter(None, plans))

    async def job(self, id) -> Plan:
        def func(type):
            url = f"https://mlearn.moe.gov.tw/{type}/PartData?key={str(id)}"
            soup = BeautifulSoup(
                self.session.request("GET", url).content, "html.parser")
            title = soup.select_one("div.container.mt-3 h2")
            if title is None:
                return url, soup, None
            return url, soup, title

        url, soup, title = func("TeachingPlan")
        if title is None:
            url, soup, title = func("TopicArticle")
        if title is None:
            logger.warning(url + " is None because not find title")
            return None
        logger.info("GET data: " + url)
        p = self.parser(soup, title.text, url)
        return p

    def parser(self, soup, title, url):
        tags = []
        writers = []
        subjects = []
        formats = set()
        for tagEle in soup.select(
                "div.container.mt-3 div.d-flex.flex-column.mb-3 div"):
            tagStr = tagEle.text.split(":")
            if tagStr[0] == "科目分類":
                subjects.append(tagStr[1])
            elif tagStr[0] == "作者":
                writers.append(tagStr[1])
            elif tagStr[0] in ["教學指引", "教學媒體", "學習單"]:
                formats.add(
                    self._get_format_from_extension(
                        "." + tagEle.select_one("a").text.replace("檔案", "").lower()))
            elif tagStr[0] == "上架日期":
                pass
            else:
                tags.append(tagStr[0] + ":" + tagStr[1])
        for tag in soup.select(
                "div.container.mt-3 a.badge.badge-pill.badge-info.mb-3"):
            tags.append("網路" + ":" + tag.text)
        grades = []
        for tag in soup.select(
                "div.container.mt-3 a.badge.badge-pill.badge-success.mb-3"):
            try:
                grades += self.audience_parser(tag.text)
            except KeyError as e:
                pass
        content = ""
        img = ""
        for section in soup.select_one(
                "div.d-flex.justify-content-end.border-bottom.mb-3").next_siblings:
            if type(section) is not NavigableString:
                imgEle = section.select_one("img")
                if imgEle is not None and len(img) == 0:
                    img = imgEle["src"]
                content += str(section)
        return Plan(
            id=self._hash_id(url),
            origin_id=self.origin_id,
            title=title,
            writers=writers,
            tags=list(tags),
            page=url,
            grades=grades,
            subjects=subjects,
            formats=list(formats),
            img=img,
        )

    def audience_parser(self, content):
        return {
            "國小": [x for x in range(1, 7)],
            "國中": [x for x in range(7, 10)],
            "高中": [x for x in range(10, 13)],
        }[content]

    def getLast(self):
        url = "https://mlearn.moe.gov.tw/"
        soup = BeautifulSoup(
            self.session.request("GET", url).content, "html.parser")
        href = soup.select_one(
            ".col-lg-4.d-flex.flex-column.justify-content-between.h-100 a")["href"]
        key = "/PartData?key="
        return int(href[href.index(key) + len(key):])
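# A hedged sketch of how scrapers like ArteScraper, SportsboxScraper and
# MoeScraper above might be driven from a script. The no-argument constructors,
# the `main` entry point and the use of asyncio.run are assumptions; what
# happens to the collected Plan objects is left out.
import asyncio

async def main():
    plans = []
    for scraper in (ArteScraper(), SportsboxScraper(), MoeScraper()):
        plans += await scraper.scrape()
    return plans

if __name__ == "__main__":
    all_plans = asyncio.run(main())
    print("scraped %d plans" % len(all_plans))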