def get_script(url):
    """Fetch *url* and return the text content of its
    ``.scrolling-script-container`` element.

    Returns ``None`` when the page has no such element.
    """
    html = requests.get(url).content
    tree = HTML(html)
    try:
        # cssselect returns a (possibly empty) list; [0] raises
        # IndexError when the container is missing -- that is the only
        # "expected" failure, so catch it narrowly instead of Exception.
        script_html = tree.cssselect(".scrolling-script-container")[0]
    except IndexError:
        return None
    # itertext() yields every text fragment, including nested tags'.
    return "".join(script_html.itertext())
#####################
# Grab the article titles from the SegmentFault front page.
#####################
from lxml.etree import HTML
import requests

url = 'https://segmentfault.com/'
# Selector copied straight from the browser dev tools --
# no need to even understand what it means.
css_selector = '.title>a'

text = requests.get(url).text
page = HTML(text)
# Comprehension instead of a manual append loop (clearer and faster).
titles = [title.text for title in page.cssselect(css_selector)]
print(titles)
# This whole script writes itself -- no real thinking required.
def extract_links(toot):
    """Extract all external links from a toot.

    ``toot`` is a mapping whose ``'content'`` value is an HTML string.
    Returns the list of ``href`` values for every anchor that is not
    internal (as judged by ``link_is_internal``).
    """
    html = HTML(toot['content'])
    # Select only anchors that actually carry an href: a bare
    # <a name="..."> would otherwise raise KeyError on attrib['href'].
    all_links = html.cssselect('a[href]')
    return [link.attrib['href']
            for link in all_links
            if not link_is_internal(link)]