Python clean_course示例，utils.clean_course Python示例

示例#1

0

显示文件

    def scrape_courses(self):
        """
        Scrape the course table on the current browser page.
        Also, make a request to scrape the description from the course detail
        page as well.
        """
        # Skip the first row cause its a header row.
        rows = self.browser.find_elements_by_xpath('//table/tbody/tr')[1:]

        # Each row has 4 column.
        # Course (link to detail) | Category | Title | Credits
        for row in rows:
            abbrev, category, title, credits = row.find_elements_by_tag_name(
                'td')
            course_link = abbrev.find_element_by_tag_name('a')
            # filter out fake rows
            if abbrev.text == 'category':
                continue
            resp = requests.get(course_link.get_attribute('href'))
            soup = bs4.BeautifulSoup(resp.text, 'lxml')
            desc = soup.select('div.location-result p')[0]

            course = {
                'course': abbrev.text.upper(),
                'title': title.text.title(),
                'category': category.text.title(),
                'credits': credits.text,
                'description': desc.text
            }
            final = utils.clean_course(course)
            print final['course'], '-', final['title']
            self.courses.append(utils.clean_course(final))

示例#2

0

显示文件

    def run(self):
        """
        Scrape every course from the single Lasell College catalog page.

        All the information lives on one page, so this method downloads it,
        walks the course headings and appends one cleaned course per heading
        to ``self.courses``.
        """
        catalog_url = ('http://www.lasell.edu/academics/academic-catalog'
                       '/undergraduate-catalog/course-descriptions.html')
        soup = bs4.BeautifulSoup(urllib2.urlopen(catalog_url).read(), 'lxml')

        # All the courses are in the #tab-3 element, laid out as:
        # <h4> --> title
        # <p>  --> description
        for heading in soup.select('#tab-3 h4'):
            name = heading.text.strip()

            # The department abbreviation is a run of 2-4 capital letters
            # immediately followed by a digit.
            match = re.search(r'([A-Z]{2,4})[0-9]', name)
            department = (self.department_map.get(match.group(1))
                          if match else None)

            # The description is taken from the element right after the
            # heading, when there is one.
            sibling = heading.find_next_sibling()
            description = sibling.text.strip() if sibling else None

            self.courses.append(utils.clean_course({
                'title': name,
                'department': department,
                'description': description,
            }))

示例#3

0

显示文件

    def scrape_courses(self, soup):
        """
        Scrape every course on a category page into ``self.courses``.

        The site is laid out the following way:
        <p class="Heading-6-Courses"> -> has the course title and credits
        <p class="Course-Body"> -> description (sometimes more than one of
        these; <p class="Body-Text"> is handled the same way)

        A course is only stored when its title is not already present in
        ``self.course_set``.

        :param soup: BeautifulSoup of a category page.
        """
        def close_out(finished):
            # Store a finished course unless it is empty or its title has
            # been seen before. The title recorded in course_set is the one
            # returned by utils.clean_course.
            if finished and finished['title'] not in self.course_set:
                finished = utils.clean_course(finished)
                self.courses.append(finished)
                self.course_set.add(finished['title'])

        heading = soup.select('#contentBody h1')[0].text.encode('ascii',
                                                                'ignore')
        category = re.search(
            r'Course Catalog: (.*?) Courses', heading
        ).groups(0)[0]
        div = soup.select('#contentLeft')[0]
        course = {}
        for child in div.children:
            if isinstance(child, bs4.element.NavigableString):
                continue
            if child.attrs == {'class': ['Heading-6-Courses']}:
                # New course: close out the previous one, then parse the
                # new heading.
                close_out(course)

                # This element should look like this:
                # course abbreviation<br>title<br>credits
                contents = list(child.children)[0]
                contents = [a.string.encode('ascii', 'ignore')
                            for a in contents
                            if isinstance(a, bs4.element.NavigableString)]
                assert len(contents) == 3
                title = '{}: {}'.format(contents[0], contents[1])
                # Drop the trailing " credit(s)..." text, then spell out
                # ranges such as "3-4" as "3 to 4".
                creds = re.sub(r' [cC]redit[s]{0,1}(.*?)$', '', contents[2])
                creds = re.sub(r'[ ]{0,1}-[ ]{0,1}', ' to ', creds)
                course = {
                    'category': category,
                    'title': title,
                    'credits': creds
                }
            elif ((child.attrs == {'class': ['Course-Body']} or
                    child.attrs == {'class': ['Body-Text']}) and
                    course):
                # Description paragraphs accumulate onto the current course.
                desc = child.text.encode('ascii', 'ignore')
                if 'description' in course:
                    course['description'] += ' ' + desc
                else:
                    course['description'] = desc

        # The final course on the page has no following heading to trigger
        # its close-out, so flush it here; otherwise it would be dropped.
        close_out(course)

示例#4

0

显示文件

文件： spider.py 项目： AnabellJimenez/ALEX_Courses

    def parse_course(self, response):
        """
        Parse one course detail page and yield the cleaned course.

        The yielded item is the result of ``utils.clean_course`` with the
        page URL added under the ``'url'`` key.

        :param response: response object for the course detail page; it must
            support ``xpath()`` and expose ``url``.
        """
        course = {}

        course['title'] = response.xpath(
            '//span[@id="detail_title"]/text()').extract()[0]

        course['description'] = response.xpath(
            '//p[@id="detail_description"]/text()').extract()[0]

        # There are 5 tables on every course page; only the first three are
        # read here. For tables 1 and 2 the data is in the second row.
        tables = response.xpath('//div[@id="detail"]/table')

        # Table 1: School, Department, Faculty
        row1_cols = tables[0].xpath('tr[2]/td')
        course['school'] = row1_cols[0].xpath('text()').extract()[0]
        course['department'] = row1_cols[1].xpath('text()').extract()[0]
        course['faculty'] = row1_cols[2].xpath('span/text()').extract()[0]

        # Table 2: Term, Day and Time
        row2_cols = tables[1].xpath('tr[2]/td')
        course['term'] = row2_cols[0].xpath('text()').extract()[0]
        # Day and time has some weird spacing: split on tabs, drop the empty
        # pieces and re-join with single spaces.
        day_and_time_raw = row2_cols[1].xpath('text()').extract()[0]
        day_and_time_raw = [
            s.encode('ascii', 'ignore') for s in day_and_time_raw.split('\t')
            if s
        ]
        course['day_and_time'] = ' '.join(day_and_time_raw)

        # Table 3: Credits, Credit Level (cells are selected across all rows
        # of this table, unlike the first two tables).
        row3_cols = tables[2].xpath('tr/td')
        course['credits'] = row3_cols[0].xpath('text()').extract()[0]
        course['credit_level'] = row3_cols[1].xpath('text()').extract()[0]

        final = utils.clean_course(course)
        # Use the public ``url`` attribute rather than the private ``_url``.
        final['url'] = response.url
        yield final

示例#5

0

显示文件

    def scrape_courses(self, soup):
        """
        Scrape courses out of the loosely structured main content panel.

        Every ``<p>`` inside ``#MainContent_0_0_pnlDiv`` that yields any text
        becomes one course appended to ``self.courses``. Nothing happens when
        the panel is missing from the page.
        """
        def is_text(node):
            return isinstance(node, bs4.element.NavigableString)

        def ascii_text(node):
            return node.string.encode('ascii', 'ignore').strip()

        panel = soup.find('div', {'id': 'MainContent_0_0_pnlDiv'})
        if panel is None:
            return

        for paragraph in panel.find_all('p'):
            # Collect text up to three levels deep: direct strings, strings
            # inside child tags, and strings inside grandchild tags (the
            # latter joined with ': ' into a single piece).
            pieces = []
            for child in paragraph.children:
                if is_text(child):
                    pieces.append(ascii_text(child))
                    continue
                for grandchild in child.contents:
                    if is_text(grandchild):
                        pieces.append(ascii_text(grandchild))
                    else:
                        pieces.append(': '.join(
                            ascii_text(leaf) for leaf in grandchild.contents
                            if is_text(leaf)))

            pieces = [piece for piece in pieces if piece]
            if not pieces:
                continue

            # With more than two pieces the first two form the title;
            # otherwise only the first one does. The rest is the description.
            if len(pieces) > 2:
                heading = '{}: {}'.format(pieces[0], pieces[1])
                remainder = pieces[2:]
            else:
                heading = pieces[0]
                remainder = pieces[1:]

            self.courses.append(utils.clean_course({
                'title': heading,
                'desc': ''.join(remainder),
            }))

示例#6

0

显示文件

文件： spider.py 项目： AnabellJimenez/ALEX_Courses

    def scrape_course(self, soup, college, department):
        course = {}
        section = soup.select('#main')[0]
        title = section.find('h1')
        description = section.find('div', {'class': 'desc'})
        credits = section.find('div', {'class': 'credits'})
        others = section.find_all('h3')
        others.pop(0)
        if others:
            indices = [(h3, section.contents.index(h3)) for h3 in others
                       if h3 in section.contents]
            for i, (h3, h3_index) in enumerate(indices):
                if i == len(indices) - 1:
                    end = len(section.contents)
                else:
                    end = indices[i + 1][1]
                if h3_index == end - 1:
                    contents = [section.contents[end]]
                else:
                    contents = section.contents[h3_index + 1:end]
                final_contents = []
                for c in contents:
                    if isinstance(c, bs4.element.NavigableString):
                        final_contents.append(c.string.strip())
                    elif c.name == 'div':
                        pass
                    else:
                        final_contents.append(c.text.strip())
                field = h3.text.strip().encode('ascii', 'ignore')
                self.fields.add(field)
                course[field] = ' '.join(final_contents)

        course['college'] = college
        course['department'] = department
        course['title'] = title.text.strip()
        course['description'] = description.text.strip()
        course['credits'] = credits.text.strip() if credits else None
        print course['title']
        self.courses.append(utils.clean_course(course))

示例#7

0

显示文件

    def scrape_course(self, row, category):
        course = {}
        info = row.find('a')
        title = info.text
        credits = re.search(r'(\d)\s?cr', title)
        if credits:
            course['credits'] = credits.group(0).replace('cr', '').strip()
            course['title'] = title.replace(credits.group(0), '').strip()
        else:
            course['credits'] = None
            course['title'] = title
        course['category'] = category
        course['link'] = '{}/{}'.format(self.base_url, info.attrs['href'])

        desc_page = urllib2.urlopen(course['link']).read()
        soup = bs4.BeautifulSoup(desc_page, 'lxml')
        td = soup.find('td', {'class': 'block_content'})
        # We need to be smart about how we grab the description cause its not
        # very organized.
        # We only want elements that contain description text.
        desc = []
        for i, c in enumerate(td.contents):
            if c.name in ('h1', 'table', 'div'):
                pass
            elif c.name == 'br' and td.contents[i + 1].name == 'br':
                break
            else:
                if isinstance(c, bs4.element.NavigableString):
                    text = c.string
                else:
                    text = c.text
                desc.append(text.strip().encode('ascii', 'ignore'))

        # desc = [c.string.strip().encode('ascii','ignore') for c in td.contents
        #         if isinstance(c, bs4.element.NavigableString) and
        #         c.string.strip().encode('ascii','ignore')]
        course['description'] = ' '.join(desc).replace(' ,', ',').strip()
        print course['title'], ':', course['credits']
        self.courses.append(utils.clean_course(course))

示例#8

0

显示文件

    def parse_course(self, response):
        """
        Scrape the contents for an individual course.
        """
        course = {}
        course['title'] = response.xpath(
            '//h1/a[@class="title"]/text()').extract()[0]
        course['category'] = response.xpath(
            '//div[@class="Breads"]/span/text()').extract()[0]

        # The description element manifests differently on every course page!
        desc_all = response.xpath(
            '//span[@class="text"]/descendant-or-self::*/text()')
        # Filter line breaks and other random artifacts.
        desc_extracted = [
            c.extract().strip().replace('\r\n', '').encode('ascii', 'ignore')
            for c in desc_all
        ]
        # Filter out known unnecessary information.
        desc_filtered = [
            c for c in desc_extracted[:-1]
            if 'Credit Hours' not in c and 'Course Descriptions' not in c
            and c != course['title'] and c != ''
        ]
        # Separate out prerequisites, if there are any.
        prerequisites = [
            c for c in desc_filtered if c.startswith('Prerequisite')
        ]
        if prerequisites:
            course['prerequisite'] = prerequisites[0]
            desc_filtered.remove(course['prerequisite'])
        else:
            course['prerequisite'] = None
        course['description'] = '; '.join(desc_filtered)
        print course['title']
        yield utils.clean_course(course)

示例#9

0

显示文件

def scrape():
    html = open('necc_spring_2017.html', 'r')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    url = 'https://ssb.necc.mass.edu:9030'

    rows = soup.select('table.datadisplaytable tr')

    # Every odd row is the title of the course
    # Every even row is the info about the course
    # So we need to parse them as couples

    courses = []

    for i in range(0, len(rows) - 1, 2):
        course = {}
        # TITLE
        course['title'] = rows[i].text.strip().encode('ascii', 'ignore')
        course['link'] = url + rows[i].find('a').attrs['href']

        # DESCRIPTION
        # text in wrapper <td> element, before <br>
        td = rows[i + 1].find('td')
        desc = td.next_element
        course['description'] = desc.strip()
        if desc.next_element.name == 'b':
            # PREREQUISITES
            # looks like "Prerequisite(s): bunch of stuff
            # just want stuff after the first colon
            # also in some cases, there is a newline with non-prereq info.
            # cut that out as well.
            all_prereq_text = desc.next_element.text.strip()
            just_prereqs = ''.join(
                all_prereq_text.split(': ')[1:]).split('\n')[0]
            course['prerequisites'] = just_prereqs
        else:
            course['prerequisites'] = None

        # CREDITS
        # ???

        # LEVELS
        spans = td.select('.fieldlabeltext')
        try:
            course['levels'] = spans[0].next_sibling.strip()
        except IndexError:
            course['levels'] = None

        # SCHEDULE TYPE
        # first text after the second span
        # sometimes they are wrapped in <a>, sometimes not
        try:
            if spans[1].next_sibling.name == 'a':
                items = spans[1].find_next_siblings('a')
                course['schedule_type'] = ', '.join(
                    [s.string.strip() for s in items])
            else:
                course['schedule_type'] = spans[1].next_sibling.string.strip()
        except IndexError:
            course['schedule_type'] = None

        # DEPARTMENT
        # usually after a newline, always has word Department
        # extract just the important bit
        department_matches = re.search('\\n[ a-zA-Z&]{1,}Department', td.text)
        if department_matches:
            department = re.sub('\\n[ ]{1,}', '', department_matches.group(0))
            course['department'] = department.replace(' Department', '')
        else:
            course['department'] = None

        print course['title']
        courses.append(utils.clean_course(course))

    return courses

示例#10

0

显示文件

文件： spider.py 项目： AnabellJimenez/ALEX_Courses

    def scrape_courses(self, soup):
        """
        Parses a page of courses under a particular letter and extracts the
        course information.

        Every section row of a course's details table is appended to
        ``self.courses`` as its own cleaned course.

        :param soup: BeautifulSoup of a web page under a letter.
        """

        # Page layout resembles the following:
        # <h2> -> course category, one of these has multiple courses under it
        # <h3> -> course title
        # <p> -> course description
        # <div class="tablewrap"> -> course details
        # <div class="separator"> -> separates courses

        # We walk the direct children of the main body element and react
        # based on the type of each element.

        course = {}
        cur_category = None
        for element in soup.find(id='simmonsmainBody').children:
            if element.name == 'h2':
                cur_category = element.text
                course['category'] = cur_category
            elif element.name == 'h3':
                course['title'] = element.text
            elif element.name == 'p':
                course['description'] = element.text
            elif (element.name == 'div' and element.attrs == {
                    'class': ['tablewrap']
            }):
                # Within this element is a table with the following columns:
                # Section, Dates, Days, Times, Room, Instructor,
                # Section Status, Avail Seats, Requires Consent, Credits

                # Sometimes this table has multiple rows for different
                # sections of the course. These should be treated as separate
                # courses for now.
                rows = element.find_all('tr')
                if not rows:
                    continue

                # First row is table headers - skip them
                for row in rows[1:]:
                    cols = row.find_all('td')
                    # Each section gets its own copy of the shared course
                    # fields, so filling in the next row cannot alter a
                    # course that has already been stored.
                    section_course = dict(course)
                    # Select by position: the instructor column (index 5) is
                    # handled separately below. Position is used instead of
                    # cols.index(c), which compares cells by equality and
                    # picks the wrong one when two cells are equal.
                    (section_course['section'],
                     section_course['dates'],
                     section_course['days'],
                     section_course['times'],
                     section_course['room'],
                     section_course['section status'],
                     section_course['avail seats'],
                     section_course['requires consent'],
                     section_course['credits']) = [
                        c.text.strip() for idx, c in enumerate(cols)
                        if idx != 5
                    ]
                    # If the instructor column has a <br> element, the names
                    # on either side of it are joined with a comma.
                    section_course['instructor'] = ', '.join([
                        c.string.strip() for c in cols[5].children
                        if isinstance(c, bs4.element.NavigableString)
                    ])
                    self.courses.append(utils.clean_course(section_course))
            elif (element.name == 'div' and element.attrs == {
                    'class': ['separator']
            } and course):
                # Start the next course with only the current category.
                course = {'category': cur_category}