def scrape(self):
    """Yield a dict for every course on ntnu.no taught in ``self.semester``.

    The course search listing is fetched for ``year`` (the semester's own
    year for fall semesters, the year before for spring semesters). Each
    linked course page is then fetched and its ``.infoBox`` elements are
    turned into ``{box title: {field: [values, ...]}}``.

    Links whose course code cannot be extracted, or is rejected by
    ``ntnu.valid_course_code`` / ``self.should_proccess_course``, are
    skipped, as are courses whose page lacks 'Undervisning'/'Undervises'
    or does not list the wanted term.

    Yields:
        dict: ``code``, ``name``, ``version`` (int), ``points`` (float)
        and ``url`` (the href taken from the listing).

    Raises:
        KeyError: if the page has no 'Versjon' or 'Studiepoeng' field
            under 'Fakta om emnet'.
        ValueError: if those fields are not numeric.
    """
    if self.semester.type == Semester.FALL:
        year = self.semester.year
    else:
        year = self.semester.year - 1

    code_re = re.compile('/studier/emner/([^/]+)/', re.I | re.L)

    url = 'http://www.ntnu.no/web/studier/emnesok'
    query = {
        'p_p_lifecycle': '2',
        'p_p_id': 'courselistportlet_WAR_courselistportlet_INSTANCE_m8nT',
        '_courselistportlet_WAR_courselistportlet_INSTANCE_m8nT_year': year,
    }

    courses_root = fetch.html(url, query=query, verbose=True)
    for a in courses_root.cssselect('a[href*="/studier/emner/"]'):
        course_url = a.attrib['href']

        # The CSS selector only guarantees the path prefix is present; the
        # regex additionally needs a "<code>/" segment after it. Skip links
        # without one instead of failing on a None match.
        match = code_re.search(course_url)
        if match is None:
            continue
        code = match.group(1)

        if not ntnu.valid_course_code(code):
            continue
        elif not self.should_proccess_course(code):
            continue

        quoted_code = urllib.quote(code.encode('utf-8'))
        name = a.text_content()

        title = None
        data = {}
        root = fetch.html(
            'http://www.ntnu.no/studier/emner/%s/%s' % (quoted_code, year))

        # Construct dict out of info boxes. Fields seen before the first
        # <h3> end up under the key None.
        for box in root.cssselect('.infoBox'):
            for child in box.getchildren():
                if child.tag == 'h3':
                    title = child.text_content()
                    continue

                # The element's own text plus the tail text of each of its
                # children; each piece may hold one "key: value" pair.
                parts = [child.text or '']
                parts.extend(br.tail or '' for br in child.getchildren())

                for part in parts:
                    if ':' not in part:
                        continue
                    key, value = part.split(':', 1)
                    key = key.strip(u' \n\xa0')
                    value = value.strip(u' \n\xa0')
                    data.setdefault(title, {}).setdefault(key, []).append(value)

        try:
            semesters = data['Undervisning']['Undervises']
        except KeyError:
            continue

        # NOTE(review): for spring semesters ``year`` is semester.year - 1,
        # so this looks for u'VÅR <previous year>' -- confirm that this is
        # how the course page labels the spring term.
        if self.semester.type == Semester.FALL and u'HØST %s' % year not in semesters:
            continue
        elif self.semester.type == Semester.SPRING and u'VÅR %s' % year not in semesters:
            continue

        yield {
            'code': code,
            'name': name,
            'version': int(data['Fakta om emnet']['Versjon'][0]),
            'points': float(data['Fakta om emnet']['Studiepoeng'][0]),
            'url': course_url,
        }
def fetch_courses(semester, prefix=None):
    """Yield the detailed record of every course taught in ``semester``.

    Walks the course listing at ``BASE + '/course/-'`` and looks up each
    entry with ``fetch_course``. An entry is dropped when:

    * its code fails ``ntnu.valid_course_code`` (a warning is logged),
    * ``prefix`` is given and the code does not start with it,
    * ``fetch_course`` returns nothing,
    * ``semester.year`` lies before 'taughtFromYear' or after a non-empty
      'lastYearTaught',
    * the detailed 'versionCode' differs from the one in the listing,
    * the course is not flagged as taught in the semester's term.

    Args:
        semester: object exposing ``year``, ``type``, ``FALL`` and ``SPRING``.
        prefix: optional course code prefix to restrict the result to.

    Yields:
        The value returned by ``fetch_course`` for each matching course.
    """
    listing = fetch.json(BASE + '/course/-')['course']

    for entry in listing:
        if not ntnu.valid_course_code(entry['code']):
            logging.warning('Skipped invalid course name: %s', entry['code'])
            continue

        # TODO: shouldn't reimplement should_proccess_course
        if prefix and not entry['code'].startswith(prefix):
            continue

        details = fetch_course(entry['code'])
        if not details:
            continue

        if semester.year < details['taughtFromYear']:
            continue

        if details['lastYearTaught'] and semester.year > details['lastYearTaught']:
            continue

        if details['versionCode'] != entry['versionCode']:
            continue

        taught_this_term = (
            (semester.type == semester.FALL and details['taughtInAutumn']) or
            (semester.type == semester.SPRING and details['taughtInSpring']))
        if taught_this_term:
            yield details
def scrape(self):
    """Yield one dict per ordinary exam listed in the semester's exam plan.

    Downloads ``dato.XML`` for ``self.semester`` from ntnu.no and walks its
    ``//dato/dato_row`` elements. Nothing is yielded when ``fetch.xml``
    returns ``None``. A row is skipped when:

    * its status code is not 'ORD',
    * its course code fails ``ntnu.valid_course_code`` (warning logged),
    * no ``Course`` with that code exists for the semester (debug logged),
    * ``self.should_proccess_course`` rejects the course code,
    * it has no exam type code (warning logged).

    Yields:
        dict: ``course`` (the matching ``Course``), ``exam_date`` and
        ``exam_time`` (the hand-in date/time when present, otherwise the
        exam date/time), ``combination``, ``handout_date``,
        ``handout_time``, ``type`` (from ``self.exam_type``) and
        ``duration``.
    """
    prefix = ntnu.prefix(self.semester, template='{year}{letter}')
    url = 'http://www.ntnu.no/eksamen/plan/%s/dato.XML' % prefix

    # Index the semester's courses by code for constant-time row lookups.
    courses = Course.objects.filter(semester=self.semester)
    courses = {c.code: c for c in courses}

    root = fetch.xml(url)
    if root is None:
        return

    for row in root.xpath('//dato/dato_row'):
        course_code = get(row, 'emnekode')
        status_code = get(row, 'vurdstatuskode')

        if status_code != 'ORD':
            continue
        elif not ntnu.valid_course_code(course_code):
            logging.warning("Invalid course code: %s", course_code)
            continue
        elif course_code not in courses:
            logging.debug("Unknown course %s.", course_code)
            continue
        elif not self.should_proccess_course(course_code):
            continue

        combination = get(row, 'vurdkombkode')
        duration = get(row, 'varighettimer')
        exam_date = get(row, 'dato_eksamen')
        exam_time = get(row, 'klokkeslett_fremmote_tid')
        handin_date = get(row, 'dato_innlevering')
        handin_time = get(row, 'klokkeslett_innlevering')
        handout_date = get(row, 'dato_uttak')
        handout_time = get(row, 'klokkeslett_uttak')
        type_code = get(row, 'vurderingsformkode')
        type_name = get(row, 'vurderingskombinasjon_vurdkombnavn_bokmal')

        if not type_code:
            logging.warning('Missing exam type for %s', course_code)
            continue

        yield {
            'course': courses[course_code],
            # A hand-in deadline, when given, takes precedence over the
            # exam date/time.
            'exam_date': utils.parse_date(handin_date or exam_date),
            'exam_time': utils.parse_time(handin_time or exam_time),
            'combination': combination,
            'handout_date': utils.parse_date(handout_date),
            'handout_time': utils.parse_time(handout_time),
            'type': self.exam_type(type_code, type_name),
            'duration': duration,
        }
def scrape(self):
    """Yield exam data for every ordinary exam in the semester's exam plan.

    The plan is the ``dato.XML`` document published on ntnu.no for
    ``self.semester``; if ``fetch.xml`` returns ``None`` the generator ends
    without yielding. Each ``//dato/dato_row`` element is filtered in this
    order: status code must be 'ORD', the course code must pass
    ``ntnu.valid_course_code``, a ``Course`` with that code must exist for
    the semester, ``self.should_proccess_course`` must accept the code, and
    an exam type code must be present.

    Yields:
        dict: keys ``course``, ``exam_date``, ``exam_time``,
        ``combination``, ``handout_date``, ``handout_time``, ``type`` and
        ``duration``. ``exam_date``/``exam_time`` use the hand-in values
        when the row has them and fall back to the exam values.
    """
    prefix = ntnu.prefix(self.semester, template='{year}{letter}')
    url = 'http://www.ntnu.no/eksamen/plan/%s/dato.XML' % prefix

    # Map course code -> Course for the semester being scraped.
    known_courses = {
        c.code: c for c in Course.objects.filter(semester=self.semester)}

    root = fetch.xml(url)
    if root is None:
        return

    for row in root.xpath('//dato/dato_row'):
        course_code = get(row, 'emnekode')
        status_code = get(row, 'vurdstatuskode')

        if status_code != 'ORD':
            continue
        if not ntnu.valid_course_code(course_code):
            logging.warning("Invalid course code: %s", course_code)
            continue
        if course_code not in known_courses:
            logging.debug("Unknown course %s.", course_code)
            continue
        if not self.should_proccess_course(course_code):
            continue

        combination = get(row, 'vurdkombkode')
        duration = get(row, 'varighettimer')
        exam_date = get(row, 'dato_eksamen')
        exam_time = get(row, 'klokkeslett_fremmote_tid')
        handin_date = get(row, 'dato_innlevering')
        handin_time = get(row, 'klokkeslett_innlevering')
        handout_date = get(row, 'dato_uttak')
        handout_time = get(row, 'klokkeslett_uttak')
        type_code = get(row, 'vurderingsformkode')
        type_name = get(row, 'vurderingskombinasjon_vurdkombnavn_bokmal')

        if not type_code:
            logging.warning('Missing exam type for %s', course_code)
            continue

        yield {
            'course': known_courses[course_code],
            # Prefer the hand-in deadline over the exam date/time.
            'exam_date': utils.parse_date(handin_date or exam_date),
            'exam_time': utils.parse_time(handin_time or exam_time),
            'combination': combination,
            'handout_date': utils.parse_date(handout_date),
            'handout_time': utils.parse_time(handout_time),
            'type': self.exam_type(type_code, type_name),
            'duration': duration,
        }
def fetch_courses(semester):
    """Yield the detailed record of every course taught in ``semester``.

    The course listing is read from the IME API and each entry is looked up
    with ``fetch_course``. Entries are skipped when the code fails
    ``ntnu.valid_course_code`` (a warning is logged), when ``fetch_course``
    returns nothing, when ``semester.year`` is before 'taughtFromYear' or
    after a non-empty 'lastYearTaught', or when the course is not flagged
    as taught in the semester's term.

    Args:
        semester: object exposing ``year``, ``type``, ``FALL`` and ``SPRING``.

    Yields:
        The value returned by ``fetch_course`` for each matching course.
    """
    listing = fetch.json('http://www.ime.ntnu.no/api/course/-')['course']

    for entry in listing:
        if not ntnu.valid_course_code(entry['code']):
            logging.warning('Skipped invalid course name: %s', entry['code'])
            continue

        details = fetch_course(entry['code'])
        if not details:
            continue

        # Outside the span of years in which the course is taught.
        if semester.year < details['taughtFromYear']:
            continue
        if details['lastYearTaught'] and semester.year > details['lastYearTaught']:
            continue

        if semester.type == semester.FALL and details['taughtInAutumn']:
            yield details
            continue

        if semester.type == semester.SPRING and details['taughtInSpring']:
            yield details
def scrape(self):
    """Yield course data scraped from ntnu.no for ``self.semester``.

    Steps:

    1. Fetch the course search listing for ``year``: the semester's own
       year for fall semesters, the preceding year for spring semesters.
    2. For every link containing '/studier/emner/', extract the course
       code and skip links with no extractable code or with a code
       rejected by ``ntnu.valid_course_code`` or
       ``self.should_proccess_course``.
    3. Fetch the course page and collect the "key: value" lines of its
       ``.infoBox`` elements into ``{box title: {key: [values, ...]}}``.
    4. Skip the course unless 'Undervisning'/'Undervises' lists the wanted
       term.

    Yields:
        dict: ``code``, ``name``, ``version`` (int), ``points`` (float)
        and ``url`` (the href taken from the listing).

    Raises:
        KeyError: if 'Fakta om emnet' has no 'Versjon' or 'Studiepoeng'.
        ValueError: if those fields are not numeric.
    """
    if self.semester.type == Semester.FALL:
        year = self.semester.year
    else:
        year = self.semester.year - 1

    code_re = re.compile('/studier/emner/([^/]+)/', re.I | re.L)

    url = 'http://www.ntnu.no/web/studier/emnesok'
    query = {
        'p_p_lifecycle': '2',
        'p_p_id': 'courselistportlet_WAR_courselistportlet_INSTANCE_m8nT',
        '_courselistportlet_WAR_courselistportlet_INSTANCE_m8nT_year': year,
    }

    courses_root = fetch.html(url, query=query, verbose=True)
    for a in courses_root.cssselect('a[href*="/studier/emner/"]'):
        course_url = a.attrib['href']

        # A link can satisfy the selector without having a "<code>/"
        # segment after the prefix; search() then returns None, so such
        # links are skipped rather than dereferenced.
        found = code_re.search(course_url)
        if found is None:
            continue
        code = found.group(1)

        if not ntnu.valid_course_code(code):
            continue
        elif not self.should_proccess_course(code):
            continue

        quoted_code = urllib.quote(code.encode('utf-8'))
        name = a.text_content()

        title = None
        data = {}
        root = fetch.html(
            'http://www.ntnu.no/studier/emner/%s/%s' % (quoted_code, year))

        # Construct dict out of info boxes. An <h3> starts a new section;
        # anything before the first <h3> is stored under the key None.
        for box in root.cssselect('.infoBox'):
            for child in box.getchildren():
                if child.tag == 'h3':
                    title = child.text_content()
                else:
                    # Candidate lines: the element's text and the tail
                    # text following each of its children.
                    parts = [child.text or '']
                    for br in child.getchildren():
                        parts.append(br.tail or '')

                    pairs = [p.split(':', 1) for p in parts if ':' in p]
                    for key, value in pairs:
                        key = key.strip(u' \n\xa0')
                        value = value.strip(u' \n\xa0')
                        data.setdefault(title, {}).setdefault(key, []).append(value)

        try:
            semesters = data['Undervisning']['Undervises']
        except KeyError:
            continue

        # NOTE(review): ``year`` is semester.year - 1 for spring semesters,
        # so the spring check matches u'VÅR <previous year>' -- verify
        # against the term labels actually used on the course page.
        if self.semester.type == Semester.FALL and u'HØST %s' % year not in semesters:
            continue
        elif self.semester.type == Semester.SPRING and u'VÅR %s' % year not in semesters:
            continue

        yield {
            'code': code,
            'name': name,
            'version': int(data['Fakta om emnet']['Versjon'][0]),
            'points': float(data['Fakta om emnet']['Studiepoeng'][0]),
            'url': course_url,
        }