def _extract_table(self, html):
    """Parse a calendar HTML page and return the events found in it.

    The second top-level ``<table>`` of the page body is used as the
    calendar.  Its first row is the hour header; every following row is
    scanned cell by cell, and each cell holding more than one directly
    nested table is turned into a ``CourseEvent``.

    :param html: markup of the page, passed to ``BeautifulSoup`` unchanged.
    :return: list of ``CourseEvent`` objects, in document order.
    """
    soup = BeautifulSoup(html)
    tables = soup.html.body.findAll(name="table", recursive=False)
    # jump to the calendar table
    cal = tables[1]
    lines = cal.findAll(name="tr", recursive=False)

    # isolate the first line (hours), dropping its first cell
    hours_line = lines[0].findChildren(name="td", recursive=False)[1:]
    hours = insert_halfhour_slots_and_convert_to_datetime(hours_line)

    # direct <td> children of every remaining line, looked up only once
    rows = [line.findAll(name="td", recursive=False) for line in lines[1:]]

    # rowspan announced by the first cell of each line (0 when absent).
    # Tag.get is used instead of has_key so the lookup does not rely on a
    # deprecated BeautifulSoup method.
    n_rows = []
    for slots in rows:
        rowspan = slots[0].get("rowspan")
        n_rows.append(0 if rowspan is None else int(rowspan))

    event_list = []
    day = -1
    n = 0
    for no_line, slots in enumerate(rows):
        if not n:
            # counter exhausted: this line opens the next day
            n = n_rows[no_line]
            day += 1
            # NOTE(review): presumably -1 compensates for the leading cell
            # that carries the rowspan on a day's first line - confirm
            current_time = -1
        else:
            current_time = 0
            n -= 1

        for s in slots:
            cell = s.findAll(name="table", recursive=False)
            if len(cell) <= 1:
                # no event in this cell: move on to the next slot
                current_time += 1
                continue

            # event found; the colspan is its length in half-hour slots
            event_data = {
                "day": day,
                "start_time": hours[current_time],
                "duration": int(s["colspan"]),
            }
            # 1 colspan = 1/2 hour.  Divide by a float so that an odd
            # colspan keeps its half hour under Python 2 as well.
            delta = timedelta(hours=event_data["duration"] / 2.0)
            event_data["stop_time"] = hours[current_time] + delta

            td = cell[0].tr.findAll(name="td", recursive=False)
            # Gehol weeks when the event occurs
            event_data["weeks"] = split_weeks(td[0].contents[0].string)
            # location (empty string when the cell has no text)
            event_data["location"] = td[1].contents[0].string
            if not event_data["location"]:
                event_data["location"] = ""
            # activity
            event_data["type"] = cell[1].tr.td.contents[0].string

            # skip the slots covered by this event
            current_time = current_time + event_data["duration"]

            event_data["organizer"] = ""
            event_data["title"] = "%s - %s" % (
                self.metadata["mnemo"],
                self.metadata["title"],
            )
            event_list.append(CourseEvent(**event_data))

    return event_list
def _process_event(self, object_cell, starting_hour, num_day):
    """Build the dictionary describing the event held in one calendar cell.

    The cell must contain exactly three direct child tables, in this order:
    location/weeks, title, tutor/group/course type.

    :param object_cell: cell element of the event; its ``colspan``
        attribute gives the number of time slots.
    :param starting_hour: start of the event; a ``timedelta`` is added to
        it to obtain the stop time.
    :param num_day: value stored unchanged under the ``day`` key.
    :return: dict with the keys ``type``, ``location``, ``organizer``,
        ``title``, ``lecturer``, ``group``, ``weeks``, ``num_timeslots``,
        ``start_time``, ``stop_time`` and ``day``.
    """
    slot_count = int(object_cell["colspan"])

    # event box: three tables, one per line
    #   1. location / weeks
    #   2. title
    #   3. tutor / group / course type
    header_table, name_table, footer_table = object_cell.findChildren(
        "table", recursive=False
    )

    place = header_table.tr.findChildren("td")[0].text
    weeks_text = header_table.tr.findChildren("td")[1].text
    name = name_table.tr.td.text

    footer_cells = footer_table.findChildren("td")
    tutor = footer_cells[0].text
    group = footer_cells[1].text
    kind = footer_cells[2].text

    event = {
        "type": kind,
        "location": place,
        "organizer": "",
        "title": name,
        "lecturer": tutor,
        "group": group,
    }
    event["weeks"] = split_weeks(weeks_text)
    event["num_timeslots"] = slot_count
    event["start_time"] = starting_hour
    # length of the event, converted from time slots to hours
    length = timedelta(hours=self._convert_num_timeslots_to_hours(slot_count))
    event["stop_time"] = starting_hour + length
    event["day"] = num_day
    return event
def _process_event(self, object_cell, starting_hour, num_day):
    """Describe the event stored in ``object_cell`` as a plain dictionary.

    ``object_cell`` is expected to have three direct child tables:
    location/weeks, then title, then tutor/group/course type.

    :param object_cell: cell element whose ``colspan`` attribute is the
        number of time slots taken by the event.
    :param starting_hour: start of the event; the stop time is this value
        plus a ``timedelta``.
    :param num_day: copied as is into the ``day`` entry.
    :return: dict with the entries ``type``, ``location``, ``organizer``,
        ``title``, ``lecturer``, ``group``, ``weeks``, ``num_timeslots``,
        ``start_time``, ``stop_time`` and ``day``.
    """
    num_timeslots = int(object_cell['colspan'])

    # one table per line of the event box
    top, middle, bottom = object_cell.findChildren('table', recursive=False)

    # first line: location, then weeks
    location = top.tr.findChildren('td')[0].text
    course_weeks = top.tr.findChildren('td')[1].text

    # second line: title
    course_title = middle.tr.td.text

    # third line: tutor, group, course type
    bottom_cells = bottom.findChildren('td')
    course_tutor = bottom_cells[0].text
    course_group = bottom_cells[1].text
    course_type = bottom_cells[2].text

    return dict(
        type=course_type,
        location=location,
        organizer="",
        title=course_title,
        lecturer=course_tutor,
        group=course_group,
        weeks=split_weeks(course_weeks),
        num_timeslots=num_timeslots,
        start_time=starting_hour,
        stop_time=starting_hour + timedelta(
            hours=self._convert_num_timeslots_to_hours(num_timeslots)
        ),
        day=num_day,
    )
def _process_event(self, object_cell, starting_hour, num_day):
    """Build the dictionary describing the event held in one calendar cell.

    The cell must contain exactly three direct child tables, in this order:
    weeks/location/type, mnemonic, title.

    :param object_cell: cell element of the event; its ``colspan``
        attribute gives the number of time slots.
    :param starting_hour: start of the event; a ``timedelta`` is added to
        it to obtain the stop time.
    :param num_day: value stored unchanged under the ``day`` key.
    :return: dict with the keys ``type``, ``location``, ``organizer``,
        ``title``, ``mnemo``, ``weeks``, ``num_timeslots``, ``start_time``,
        ``stop_time`` and ``day``.
    """
    slot_count = int(object_cell['colspan'])

    # event box: three tables, one per line
    #   1. weeks / location / type
    #   2. mnemo
    #   3. title
    info_table, mnemo_table, title_table = object_cell.findChildren(
        'table', recursive=False
    )

    info_cells = info_table.tr.findChildren('td')
    weeks_text = info_cells[0].text
    place = info_cells[1].text
    kind = info_cells[2].text

    mnemo = mnemo_table.tr.td.text
    name = title_table.tr.td.text

    event = {
        'type': kind,
        'location': place,
        'organizer': "",
        'title': name,
        'mnemo': mnemo,
    }
    event['weeks'] = split_weeks(weeks_text)
    event['num_timeslots'] = slot_count
    event['start_time'] = starting_hour
    # length of the event, converted from time slots to hours
    length = timedelta(hours=self._convert_num_timeslots_to_hours(slot_count))
    event['stop_time'] = starting_hour + length
    event['day'] = num_day
    return event
def _extract_table(self, html):
    """Parse a calendar HTML page and return the events found in it.

    The second top-level ``<table>`` of the page body is used as the
    calendar.  Its first row is the hour header; every following row is
    scanned cell by cell, and each cell holding more than one directly
    nested table is turned into a ``CourseEvent``.

    :param html: markup of the page, passed to ``BeautifulSoup`` unchanged.
    :return: list of ``CourseEvent`` objects, in document order.
    """
    soup = BeautifulSoup(html)
    tables = soup.html.body.findAll(name='table', recursive=False)
    # jump to the calendar table
    cal = tables[1]
    lines = cal.findAll(name='tr', recursive=False)

    # isolate the first line (hours); its first cell is skipped
    hours_line = lines[0].findAll(name='td', recursive=False)
    hours = []
    for h in hours_line[1:]:
        if h.string:
            hours.append(convert_time(h.string))
        else:
            # header cell without text: same hour as the previous slot,
            # at minute 30
            last_added_hour = hours[-1]
            hours.append(datetime(last_added_hour.year,
                                  last_added_hour.month,
                                  last_added_hour.day,
                                  last_added_hour.hour,
                                  30))

    # direct <td> children of every remaining line, looked up only once
    rows = [line.findAll(name='td', recursive=False) for line in lines[1:]]

    # rowspan announced by the first cell of each line (0 when absent).
    # Tag.get is used instead of has_key so the lookup does not rely on a
    # deprecated BeautifulSoup method.
    n_rows = []
    for slots in rows:
        rowspan = slots[0].get('rowspan')
        n_rows.append(0 if rowspan is None else int(rowspan))

    event_list = []
    day = -1
    n = 0
    for no_line, slots in enumerate(rows):
        if not n:
            # counter exhausted: this line opens the next day
            n = n_rows[no_line]
            day += 1
            # NOTE(review): presumably -1 compensates for the leading cell
            # that carries the rowspan on a day's first line - confirm
            current_time = -1
        else:
            current_time = 0
            n -= 1

        for s in slots:
            cell = s.findAll(name='table', recursive=False)
            if len(cell) <= 1:
                # no event in this cell: move on to the next slot
                current_time += 1
                continue

            # event found; the colspan is its length in half-hour slots
            event_data = {
                'day': day,
                'start_time': hours[current_time],
                'duration': int(s['colspan']),
            }
            # 1 colspan = 1/2 hour.  Divide by a float so that an odd
            # colspan keeps its half hour under Python 2 as well.
            delta = timedelta(hours=event_data['duration'] / 2.0)
            event_data['stop_time'] = hours[current_time] + delta

            td = cell[0].tr.findAll(name='td', recursive=False)
            # Gehol weeks when the event occurs
            event_data['weeks'] = split_weeks(td[0].contents[0].string)
            # location (empty string when the cell has no text)
            event_data['location'] = td[1].contents[0].string
            if not event_data['location']:
                event_data['location'] = ''
            # activity
            event_data['type'] = cell[1].tr.td.contents[0].string

            # skip the slots covered by this event
            current_time = current_time + event_data['duration']

            event_data['organizer'] = ""
            event_data['title'] = "%s - %s" % (self.metadata['mnemo'],
                                               self.metadata['title'])
            event_list.append(CourseEvent(**event_data))

    return event_list