Example #1
    def get_submission_dates(self, arxiv_tree, queried_version):
        """Parse the arXiv "Submission history" block into {version: date}.

        Returns a single-entry dict as soon as queried_version is seen;
        otherwise returns every version found.
        """
        links = CSSSelector("div.submission-history")(arxiv_tree)[0]
        versions = {}
        for line in links.text_content().split("\n"):
            match = self.version_re.match(line)
            if match:
                version, d = match.group(1), match.group(2)
                # The date prefix looks like "Wed, 5 Jun 2019".
                d = datetime.datetime.strptime(d, '%a, %d %b %Y').date()
                versions[version] = d
                if queried_version == version:
                    return {version: d}

        return versions
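
The snippet relies on a class attribute self.version_re that is not shown. Given the strptime format '%a, %d %b %Y', a plausible definition (an assumption, not the original code) would match arXiv history lines such as "[v1] Wed, 5 Jun 2019 17:09:01 UTC (1,234 KB)":

import re

# Hypothetical regex: group(1) is the version tag ("v1"), group(2) the
# "Wed, 5 Jun 2019" date prefix consumed by strptime above.
version_re = re.compile(r'\s*\[(v\d+)\]\s+(\w{3}, \d{1,2} \w{3} \d{4})')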
Example #2
    def get_submission_dates(self, arxiv_tree, queried_version):
        """Variant of the parser above that pre-cleans the scraped text."""
        links = CSSSelector("div.submission-history")(arxiv_tree)[0]
        versions = {}
        blob = self.clean_gunky_arxiv_data(links.text_content())

        for line in blob.split("\n"):
            match = self.version_re.match(line)
            if match:
                version, d = match.group(1), match.group(2)
                d = datetime.datetime.strptime(d, '%a, %d %b %Y').date()
                versions[version] = d
                if queried_version == version:
                    return {version: d}

        return versions
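
The helper self.clean_gunky_arxiv_data is likewise not shown. A minimal sketch, assuming its only job is to normalize the scraped text so that version_re can match one history entry per line:

    def clean_gunky_arxiv_data(self, text):
        # Assumed behavior: swap non-breaking spaces for plain ones and
        # drop empty lines before line-by-line matching.
        text = text.replace('\xa0', ' ')
        lines = (line.strip() for line in text.splitlines())
        return '\n'.join(line for line in lines if line)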
Example #3
def _get_post_details(post_listing):
    """Scrape a post and return it as a Post object."""

    title_node = CSSSelector("a.topictitle")(post_listing)[0]
    title = title_node.text_content()

    url = _forum_url + title_node.get("href")[2:]

    if rp.can_fetch("*", url):
        print("Scraping post: " + title)

        post_page = lxml.html.fromstring(_get_page(url))

        author = _get_post_author(post_page)
        content = _get_post_content(post_page)
        images = _get_post_images(post_page)
        privateMessageLink = _get_private_message_link(post_page)

        return Post(title, author, url, content, images, privateMessageLink)
    else:
        _robots_not_allowed(url)
        return None
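
The module-level rp consulted above behaves like urllib.robotparser.RobotFileParser, whose can_fetch(useragent, url) has exactly this signature. A sketch of how it might be set up (assuming _forum_url ends with a slash):

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url(_forum_url + "robots.txt")
rp.read()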
Example #4
import sys

import requests
import lxml.html
from lxml.cssselect import CSSSelector


# get page
url = sys.argv[1]
page = requests.get(url).text
page = page.replace('\xa0', ' ')
tree = lxml.html.fromstring(page)


# get title
title_tag = CSSSelector('div#main h1')(tree)[0]
title = title_tag.text_content()
fb2 = title.find(' (fb2)')
if fb2 != -1:
    title = title[:fb2]


# get text
text_tag = CSSSelector('div#main div._ga1_on_')(tree)[0]
text = text_tag.text_content().strip()


# get refs: each footnote <sup> holds two <a> tags; the second one
# carries the reference text in its title attribute
ref_sup_tags = CSSSelector('sup')(text_tag)
ref_tags = [CSSSelector('a')(ref_sup_tag)[1] for ref_sup_tag in ref_sup_tags]
refs = [ref_tag.get('title').strip() for ref_tag in ref_tags]
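
A hypothetical tail for the script, just to show the three pieces it collects:

# Hypothetical output step, not part of the original example.
print(title)
print(text[:200], '...')
for i, ref in enumerate(refs, 1):
    print(i, ref)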