def initIframe_contentPage(website_url, content_url):
    """Prepare a subscribed content page by injecting helper assets.

    The page at *content_url* is fetched (with the Chrome driver when
    ``global_Chrome`` is set, otherwise via urllib).  A no-referrer meta
    tag and the subscription stylesheet are spliced into ``<head>``, and
    the subscription scripts plus a hidden ``<input>`` carrying both URLs
    are spliced in just before ``</body>``.

    :param website_url: URL of the subscribed website
    :param content_url: URL of the content page to fetch
    :return: rewritten page source, or the string "404" when the fetch fails
    """
    script_code = """
    <script src="/static/Subpage/js/common.js"></script>
    <script src="/static/Subpage/js/filterXpath.js"></script>
    <script src="/static/Subpage/js/contentDiscriminate.js"></script>
    """
    # Dispatch on the global fetch strategy, then fetch once.
    fetch = Spider().chromedriver if global_Chrome else Spider().urllib
    html = fetch(content_url)
    if not html:
        return "404"
    head_extra = (
        '<meta name="referrer" content="never">'
        '<link rel="stylesheet" type="text/css" href="/static/Subpage/css/sub.css">')
    html = re.sub(r'(<head>|<head .*?>)', r"\1%s" % head_extra, html)
    body_extra = (
        "%s<input id='sub_info' type='hidden' website_url='%s' content_url='%s'>"
        % (script_code, website_url, content_url))
    html = re.sub(r'(</body>)', r"%s\1" % body_extra, html)
    return html
def get(self):
    """Return one page of movies, populating the DB cache on a miss.

    Reads ``page_now`` and ``page_size`` from the parsed request args,
    serves rows already cached in the database when present, otherwise
    scrapes the source site, stores the results, and re-queries.

    :return: ``success_res`` with the requested page of movies, or
             ``err_res`` on missing arguments, page overflow, or
             scrape/storage failure
    """
    args = parser.parse_args()
    if 'page_now' not in args:
        return err_res(code=0, msg='no page_now')
    if 'page_size' not in args:
        return err_res(code=0, msg='no page_size')
    page_now = args['page_now']
    page_size = args['page_size']
    # The upstream source only exposes 250 movies in total.
    if page_now * page_size > 250:
        return err_res(code=0, msg='无更多电影')

    def query_page():
        # Rows use a 1-based sequential id, so a page maps directly
        # onto an inclusive id range.
        return session.query(Movie).filter(
            Movie.id.between((page_now - 1) * page_size + 1,
                             page_now * page_size)).all()

    cached_movies = query_page()
    if cached_movies:
        return success_res(code=1000, data=cached_movies, msg='success')
    try:
        spider = Spider()
        for movie in spider.get_movies(const.BASE_URL):
            create_movie(movie)
        return success_res(code=1000, data=query_page(), msg='success')
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; scraping/DB errors still map to a
        # generic error response, as before.
        return err_res(code=0, msg='err')
def getOAUrl(name):
    """Look up the article-list URL for a WeChat official account.

    POSTs *name* to aiweibang's search endpoint and builds the article
    page URL from the first search hit's ``Id``.

    :param name: official-account name to search for
    :return: the article-list URL, or None when the lookup fails
    """
    try:
        oa_json = Spider().requests(
            "http://top.aiweibang.com/user/getsearch", "POST", {'Kw': name})
        oa_data = json.loads(oa_json)["data"]
        oa_id = oa_data['data'][0]['Id']
        return "http://top.aiweibang.com/article/%s" % oa_id
    except Exception:
        # Narrowed from a bare ``except:`` (which also trapped
        # KeyboardInterrupt/SystemExit).  Network failures, malformed
        # JSON, or an empty result set all mean "account not found" here.
        return None
def initWebsite(website_type, website_url, content_url=''):
    """Initialize a subscribed website page by injecting helper assets.

    :param website_type: site kind (0: plain website URL; 1: WeChat
                         official-account name)
    :param website_url: website address, or the OA name when type is 1
    :param content_url: content address embedded into the hidden input
    :return: dict with "res" (an InitResType member) and "code" (the
             rewritten page source, or "" on failure)
    """
    # Normalize the type once.  The original compared ``int(...) == 0``
    # but ``website_type == "1"`` as a string, so an integer 1 silently
    # skipped the official-account branch.
    wtype = int(website_type)
    if wtype == 0 and not urlLegal(website_url):
        return {"res": InitResType.inlegalUrl, "code": ""}
    oa_name = ""
    if wtype == 1:
        oa_name = website_url
        website_url = getOAUrl(website_url)
        if website_url is None:
            return {"res": InitResType.OAnotfound, "code": ""}
    script_code = """
    <script src="/static/Subpage/js/common.js"></script>
    <script src = "/static/Subpage/js/filterXpath.js"></script>
    <script src="/static/Subpage/js/websiteDiscriminate.js"></script>
    """
    html = Spider().chromedriver(website_url)
    head = re.findall("(<head.*?>)", html)
    if head:
        # Append the no-referrer meta tag and subscription stylesheet
        # right after the opening <head ...> tag.
        html = html.replace(
            head[0],
            "%s%s" % (head[0],
                      '<meta name="referrer" content="never">'
                      '<link rel="stylesheet" type="text/css" href="/static/Subpage/css/sub.css">'))
    html = html.replace(
        "</body>",
        "%s<input id = 'sub_info' type = 'hidden' detail='%s' website_url='%s' content_url='%s'></body>"
        % (script_code, oa_name, website_url, content_url))
    return {"res": InitResType.success, "code": html}
from selenium import webdriver from comman.expert_csv import ExpertCSV from common.spider import Spider from model.expert import Expert url_list = ['https://blog.csdn.net/weixin_43570367?t=1'] csdn_one_page_title_count = 40 def get_writer(url): return url.split('/')[-1].split('?')[0] if __name__ == '__main__': spider = Spider( webdriver.Chrome(executable_path='../asset/chromedriver11')) driver = spider.driver() count = 1 csv = ExpertCSV() try: for url in url_list: driver.get(url) while True: for i in range(1, 41): # model 对象 expert = Expert() # 获取标题#.article-item-box:nth-child(1) > h4 > a title = spider.find_ele_by_css( f'.article-item-box:nth-child({i}) > h4 > a')