Example #1
def parse_subjects(response):
    '''
    Scrape for Courses.
    Iterates through the Subjects on this page.
    Passes each one to parse_course, which in turn passes to parse_course_details.
    '''
    rows = response.xpath('//tbody/tr')
    for row in rows:
        next_rel_url = utils.extract_element(row.xpath('./td/a/@href'), 0)
        if len(next_rel_url) > 1:
            next_url = BASE_URL + next_rel_url
            next_url = re.sub(SESSION_PATTERN, '', next_url)
            title = utils.extract_element(row.xpath('./td[2]/text()'), 0)
            code = utils.extract_element(row.xpath('./td/a/text()'), 0)

            subject = {
                "url": next_url,
                "name": code + " " + title.strip(),
                "faculty": utils.extract_element(row.xpath('./td[3]/text()'),
                                                 0)
            }

            yield scrapy.Request(next_url,
                                 callback=parse_course,
                                 priority=100,
                                 meta={'data': subject})
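
Example 1 builds its request URLs from two module-level constants that are not shown here, BASE_URL and SESSION_PATTERN. A minimal sketch of what they might look like; the site root and the session-id regex below are purely hypothetical placeholders:

import re

# Hypothetical values -- the real spider defines these elsewhere.
BASE_URL = "https://courses.example.edu"  # assumed catalogue root
# Assumed pattern for a session token embedded in the URL,
# e.g. a ";jsessionid=..." segment appended by the course catalogue.
SESSION_PATTERN = re.compile(r";jsessionid=[^?/]*", re.IGNORECASE)

# Stripping the token keeps request URLs stable between crawls:
url = BASE_URL + "/subjects/CPSC;jsessionid=DEADBEEF"
print(re.sub(SESSION_PATTERN, "", url))  # https://courses.example.edu/subjects/CPSC
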
Example #2
def parse_course(response):
    '''
    Parse subject page.
    Iterates through the Courses on this page.
    '''
    subject = response.meta['data']
    rows = response.xpath('//tbody/tr')
    for row in rows:
        next_rel_url = utils.extract_element(row.xpath('./td/a/@href'), 0)
        course_code = utils.extract_element(row.xpath('./td/a/text()'), 0)
        course_title = utils.extract_element(row.xpath('./td[2]/text()'), 0)
        course_name = course_code + " " + course_title

        if len(next_rel_url) > 1:
            next_url = BASE_URL + next_rel_url
            next_url = re.sub(SESSION_PATTERN, '', next_url)

            course = ScrapyCourseItem(
                subject=subject,
                url=next_url,
                name=course_name,
            )

            yield scrapy.Request(next_url,
                                 callback=parse_course_details,
                                 priority=100,
                                 meta={'data': course})
Example #3
def test_extract_element(self):
    """
    Test safe list extraction with default value ""
    """
    plist = [
        Selector(text="el1"),
        Selector(text="el2"),
        Selector(text="el3"),
    ]
    self.assertTrue(utils.extract_element(plist, 0))
    self.assertTrue(utils.extract_element(plist, 2))
    self.assertEqual(utils.extract_element(plist, 5), "")
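
The test above pins down the contract of utils.extract_element: index into a SelectorList (or a plain list) safely and fall back to "" when the index is out of range. A minimal sketch that is consistent with that contract and with how the helper is called in the other examples; the real implementation may differ in detail:

def extract_element(selector_list, index, default=""):
    """Safely return the extracted item at `index`, or `default` if missing."""
    try:
        element = selector_list[index]
    except IndexError:
        return default
    # Scrapy Selectors expose .extract(); plain strings pass through as-is.
    return element.extract() if hasattr(element, "extract") else element
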
Example #4
def parse_course_details(response):
    '''
    Parse course details page.
    '''
    course = response.meta['data']
    course['description'] = utils.extract_element(response.xpath('//p/text()'),
                                                  0).strip()
    # TODO: List of sections into course[sections], maybe parse as well
    return course
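
ScrapyCourseItem, constructed in Example 2 and given a description in Example 4, is never defined in these snippets. A sketch of a plausible Scrapy item with just the fields the examples actually use; the real class may declare more, e.g. the sections list mentioned in the TODO:

import scrapy


class ScrapyCourseItem(scrapy.Item):
    # Fields inferred from how the item is populated in Examples 2 and 4.
    subject = scrapy.Field()      # dict built by parse_subjects
    url = scrapy.Field()          # canonical course page URL
    name = scrapy.Field()         # "<code> <title>"
    description = scrapy.Field()  # first paragraph of the details page
    sections = scrapy.Field()     # placeholder for the TODO above
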
Example #5
def parse_post(response, links):
    '''
    Parses a reddit post's comment section
    '''

    post_section = response.xpath('//*[@id="siteTable"]')

    # check karma and discard if below post threshold
    # ('.//' keeps the query relative to this post's block, not the whole page)
    karma = utils.extract_element(
        post_section.xpath('.//div/div/div[@class="score unvoted"]/text()'), 0)
    if karma == "" or int(karma) < POST_KARMA_THRESHOLD:
        return

    # get post content
    post_content = utils.extract_element(
        post_section.xpath(
            './/div/div[@class="entry unvoted"]/div/form/div/div'), 0)
    post_content = utils.strip_content(post_content)

    # get post title
    titles = utils.extract_element(response.xpath('//title/text()'), 0)
    titles = titles.rsplit(':', 1)
    title = titles[0].strip()
    subreddit = titles[1].strip()

    # get comments
    comments = []
    comment_section = response.xpath('/html/body/div[4]/div[2]')
    comment_section = comment_section.xpath(
        './/div/div[@class="entry unvoted"]/form/div/div')
    for comment in comment_section:
        # TODO: get user of each comment as well?
        comment = utils.strip_content(comment.extract())
        if len(comment) > 0:
            comments.append(' '.join(comment))

    return ScrapyRedditPost(url=response.url,
                            title=title,
                            subreddit=subreddit,
                            post_content=post_content,
                            comments=comments,
                            links=links)
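
utils.strip_content appears twice: on an extracted comment node here (where the result is iterated and re-joined with spaces) and on the raw response.body in Example 6. That usage suggests it takes markup as str or bytes and returns a list of plain-text tokens. A rough sketch under that assumption only; the real helper may normalise text quite differently:

import re


def strip_content(markup, encoding="utf-8"):
    """Reduce an HTML fragment to a list of whitespace-separated text tokens."""
    if isinstance(markup, bytes):
        markup = markup.decode(encoding, errors="ignore")
    text = re.sub(r"<[^>]+>", " ", markup)  # drop tags
    text = re.sub(r"&[#\w]+;", " ", text)   # drop HTML entities
    return text.split()
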
Example #6
def parse_generic_item(response, links):
    '''
    Scrape generic page
    '''
    title = utils.extract_element(response.xpath("//title/text()"), 0).strip()
    titles = re.split(r'\| | - ', title)

    # Use OpenGraph title data if available
    if (len(response.xpath('//meta[@property="og:site_name"]')) > 0
            and len(response.xpath('//meta[@property="og:title"]')) > 0):
        title = utils.extract_element(
            response.xpath('//meta[@property="og:title"]/@content'), 0)
        site_title = utils.extract_element(
            response.xpath('//meta[@property="og:site_name"]/@content'), 0)
    elif len(titles) >= 2:
        title = titles[0].strip()
        site_titles = []
        for i in range(max(1, len(titles) - 2), len(titles)):
            site_titles.append(titles[i].strip())
        site_title = ' - '.join(site_titles)
    else:
        site_title = ''

    # Use OpenGraph description if available
    if len(response.xpath('//meta[@property="og:description"]')) > 0:
        desc = utils.extract_element(
            response.xpath('//meta[@property="og:description"]/@content'), 0)
    else:
        desc = utils.extract_element(
            response.xpath('//meta[@name="description"]/@content'), 0)

    raw_content = utils.strip_content(response.body)

    return ScrapyGenericPage(url=response.url,
                             title=title,
                             site_title=site_title,
                             description=desc,
                             raw_content=raw_content,
                             links=links)
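
As with ScrapyCourseItem, the ScrapyRedditPost and ScrapyGenericPage items are only constructed, never defined, in these examples. Sketches limited to the fields the constructors above actually pass; anything beyond that would be guesswork:

import scrapy


class ScrapyRedditPost(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    subreddit = scrapy.Field()
    post_content = scrapy.Field()  # token list from strip_content
    comments = scrapy.Field()      # list of joined comment strings
    links = scrapy.Field()


class ScrapyGenericPage(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    site_title = scrapy.Field()
    description = scrapy.Field()
    raw_content = scrapy.Field()   # token list from strip_content
    links = scrapy.Field()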