def _parse_timetable_heading(heading_text):
    """Split a heading's bold text into ``(title, code)``.

    Returns None when the text starts with neither "Module:" nor
    "Programme:".
    """
    if heading_text.startswith("Module:"):
        ## "Module: <code> <title>" -- maxsplit keeps a multi-word title in
        ## one piece instead of failing the three-way unpack.
        parts = heading_text.split(" ", 2)
        parts += [""] * (3 - len(parts))
        return parts[2], parts[1]
    if heading_text.startswith("Programme:"):
        ## Remove the literal prefix. str.lstrip("Programme: ") would strip
        ## any leading run of those *characters*, eating into the title.
        return heading_text[len("Programme:"):].lstrip(" "), ""
    return None


def _merge_activity(acts, activity):
    """Fold one timetable row into ``acts``, keeping distinct values per column."""
    merged = acts.setdefault(activity["Activity"], {})
    for key, value in activity.items():
        values = merged.setdefault(key, [])
        if value not in values:
            values.append(value)


def _finish_module(module):
    """Collapse single-element value lists to the bare value; return ``module``."""
    for fields in module['acts'].values():
        for key in fields:
            if len(fields[key]) == 1:
                fields[key] = fields[key][0]
    return module


def scrape_timetable(doc):
    """Collect per-module timetable activities from the tables in ``doc``.

    ``doc`` must provide ``getiterator``; the elements it yields must provide
    ``items``, ``getiterator``, ``getchildren``, ``text`` and iteration over
    their children.  Tables are classified purely by their exact attribute
    set:

    * ``width="100%" border="0"`` closes the current module and, if its first
      ``<b>`` text starts with "Module:" or "Programme:", opens a new one.
    * ``cellspacing="0" cellpadding="2%" border="1"`` is a timetable block:
      the first row supplies the column names (passed through ``spelling``),
      the remaining rows are activities.  Rows whose 'Name of Type' is not in
      ``activity_types`` are ignored.

    ``scraper``, ``spelling`` and ``activity_types`` come from elsewhere in
    the project.

    Returns a list of dicts with keys 'title', 'code' and 'acts'.  'acts' maps
    each row's "Activity" value to a dict of column name -> value, or to a
    list of the distinct values when rows for that activity disagree.
    """
    heading_attrs = set([
        ("width", "100%"),
        ("border", "0"),
    ])
    timetable_attrs = set([
        ("cellspacing", "0"),
        ("cellpadding", "2%"),
        ("border", "1"),
    ])

    modules = []
    module = {}
    for table in doc.getiterator(scraper.tag("table")):
        attrs = set(table.items())
        if attrs == heading_attrs:
            ## denotes end of timetable block and possible start of new heading
            if module:
                modules.append(_finish_module(module))
                module = {}
            bolds = list(table.getiterator(scraper.tag("b")))
            if not bolds:
                continue  ## not a heading
            heading = _parse_timetable_heading(bolds[0].text or "")
            if heading is None:
                continue
            module['title'], module['code'] = heading
            module['acts'] = {}
        elif attrs == timetable_attrs:
            ## timetable block
            if 'acts' not in module:
                ## No recognised heading is open, so there is nothing to
                ## attach these rows to.
                continue
            rows = list(table.getiterator(scraper.tag("tr")))
            if not rows:
                continue
            ## Built as a real list: it is re-used for every row, which a
            ## Python 3 map() iterator would not survive.
            header = [spelling(cell.text) for cell in rows[0].getchildren()]
            for row in rows[1:]:
                activity = dict(zip(header, [cell.text for cell in row]))
                if activity['Name of Type'] not in activity_types:
                    continue
                _merge_activity(module['acts'], activity)

    ## A document that ends without a trailing separator table would
    ## otherwise lose its last module.
    if module:
        modules.append(_finish_module(module))
    return modules
def scrape_module_details(doc):
    """Extract labelled module details from the ``<p>`` elements of ``doc``.

    ``doc`` must provide ``getiterator``; paragraph elements must provide
    ``getchildren`` and their children ``text`` / ``tail``.  ``scraper`` comes
    from elsewhere in the project.

    For every paragraph child:

    * text and tail both present: stored as ``details[label] = value``, where
      the label is the stripped text without surrounding colons.
    * tail only: ignored until a child whose text starts with
      "Learning Outcomes" has been seen.  After that, tails fill in
      'Knowledge and Understanding' and the three skills sections, either
      inline ("Heading: text") or from the tail that follows a bare heading.

    Returns the ``details`` dict.  Each skills section is guaranteed to be
    present and to end with a full stop; a section with no text gets a
    '[No ... listed.]' placeholder.  'Knowledge and Understanding' may be
    absent, or None if its heading was seen without any following text.
    """
    SKILLS = ('Intellectual Skills', 'Professional Skills',
              'Transferable Skills',)
    KNOWLEDGE = 'Knowledge and Understanding'

    details = {}
    outcomes = False
    for p in doc.getiterator(scraper.tag("p")):
        for c in p.getchildren():
            if c.tail and c.text:
                details[c.text.strip().strip(":")] = c.tail.strip()
            elif c.tail:
                if not outcomes:
                    continue
                cts = c.tail.strip()
                if KNOWLEDGE in details and not details[KNOWLEDGE]:
                    ## Heading already seen but still empty: this tail is
                    ## its body text.
                    if not cts.endswith("."):
                        cts += "."
                    details[KNOWLEDGE] = cts
                elif cts.startswith(KNOWLEDGE):
                    details[KNOWLEDGE] = None
                else:
                    for k in SKILLS:
                        if k in details and len(details[k]) == 0:
                            details[k] = cts
                        elif cts.startswith(k):
                            ## Drop the heading itself, then any ".: "
                            ## separator.  str.lstrip with the heading in
                            ## its argument strips a character *set* and
                            ## would eat leading letters of the text.
                            details[k] = cts[len(k):].lstrip(".: ")
            elif c.text:
                if c.text.strip().startswith("Learning Outcomes"):
                    outcomes = True

    for k in SKILLS:
        if details.get(k):
            if not details[k].endswith("."):
                details[k] += "."
        else:
            ## Missing, or heading present with no text after it.
            details[k] = '[No %s listed.]' % (k.lower(),)
    return details