def get_submission_dates(self, arxiv_tree, queried_version):
    """Parse the arXiv submission-history box into {version: date}.

    arxiv_tree: lxml HTML tree of an arXiv abstract page.
    queried_version: version label (e.g. "v2"); when a matching line is
        found, only that single entry is returned, short-circuiting the
        rest of the history.
    Returns a dict mapping version string -> datetime.date.
    Raises IndexError if the page has no div.submission-history element.
    """
    history = CSSSelector("div.submission-history")(arxiv_tree)[0]
    versions = {}
    for line in history.text_content().split("\n"):
        match = self.version_re.match(line)
        if not match:
            continue
        version, date_str = match.group(1), match.group(2)
        # Dates look like "Mon, 2 Apr 2012"; any trailing time/zone text
        # is assumed to be excluded by version_re's group 2 -- TODO confirm.
        parsed = datetime.datetime.strptime(date_str, '%a, %d %b %Y').date()
        versions[version] = parsed
        if queried_version == version:
            # Caller asked for one specific version; stop scanning early.
            return {version: parsed}
    return versions
def get_submission_dates(self, arxiv_tree, queried_version):
    """Return {version: datetime.date} parsed from the submission history.

    Scans each line of the div.submission-history text for entries that
    match self.version_re; if queried_version is seen, a one-entry dict
    for just that version is returned immediately.
    """
    node = CSSSelector("div.submission-history")(arxiv_tree)[0]
    found = {}
    for raw_line in node.text_content().split("\n"):
        m = self.version_re.match(raw_line)
        if m is None:
            continue
        ver = m.group(1)
        when = datetime.datetime.strptime(m.group(2), '%a, %d %b %Y').date()
        found[ver] = when
        if ver == queried_version:
            return {ver: when}
    return found
def get_submission_dates(self, arxiv_tree, queried_version):
    """Parse the (pre-cleaned) arXiv submission-history box into {version: date}.

    arxiv_tree: lxml HTML tree of an arXiv abstract page.
    queried_version: version label (e.g. "v2"); when matched, only that
        entry is returned, short-circuiting the rest of the history.
    Returns a dict mapping version string -> datetime.date.
    Raises IndexError if the page has no div.submission-history element.
    """
    history = CSSSelector("div.submission-history")(arxiv_tree)[0]
    versions = {}
    # Raw text goes through clean_gunky_arxiv_data before line-wise
    # matching -- presumably it strips markup noise; verify against
    # that helper's definition.
    blob = self.clean_gunky_arxiv_data(history.text_content())
    for line in blob.split("\n"):
        match = self.version_re.match(line)
        if not match:
            continue
        version, date_str = match.group(1), match.group(2)
        parsed = datetime.datetime.strptime(date_str, '%a, %d %b %Y').date()
        versions[version] = parsed
        if queried_version == version:
            # Caller asked for one specific version; stop scanning early.
            return {version: parsed}
    return versions
def _get_post_details(post_listing):
    """Scrape a post and return it as a Post object.

    post_listing: lxml element for one row of the forum topic listing.
    Returns a Post(title, author, url, content, images, pm_link), or
    None when robots.txt disallows fetching the post URL.
    """
    title_node = CSSSelector("a.topictitle")(post_listing)[0]
    title = title_node.text_content()
    # [2:] drops a leading relative prefix from href (presumably "./")
    # before joining onto the forum base URL -- TODO confirm.
    url = _forum_url + title_node.get("href")[2:]
    if not rp.can_fetch("*", url):
        _robots_not_allowed(url)
        return None
    # Parenthesized print is valid in both Python 2 and 3 (the original
    # py2-only print statement broke under Python 3).
    print("Scraping post: " + title)
    post_page = lxml.html.fromstring(_get_page(url))
    author = _get_post_author(post_page)
    content = _get_post_content(post_page)
    images = _get_post_images(post_page)
    private_message_link = _get_private_message_link(post_page)
    return Post(title, author, url, content, images, private_message_link)
import sys

import requests
import lxml.html
from lxml.cssselect import CSSSelector

# --- fetch the page named on the command line ---
# BUG FIX: sys was used (sys.argv) but never imported; added above.
url = sys.argv[1]
page = requests.get(url).text
# Normalize non-breaking spaces so later text matching behaves.
page = page.replace('\xa0', ' ')
tree = lxml.html.fromstring(page)

# --- title: strip a trailing " (fb2)" suffix when present ---
title_tag = CSSSelector('div#main h1')(tree)[0]
title = title_tag.text_content()
fb2 = title.find(' (fb2)')
if fb2 != -1:
    title = title[:fb2]

# --- main text ---
text_tag = CSSSelector('div#main div._ga1_on_')(tree)[0]
text = text_tag.text_content().strip()

# --- footnote references ---
ref_sup_tags = CSSSelector('sup')(text_tag)
# Each <sup> is assumed to contain at least two <a> tags, the second
# being the reference link with a 'title' attribute -- TODO confirm
# against the site's actual markup; an IndexError here means a <sup>
# with fewer links.
ref_tags = [CSSSelector('a')(ref_sup_tag)[1] for ref_sup_tag in ref_sup_tags]
refs = [ref_tag.get('title').strip() for ref_tag in ref_tags]