def scrape_personal_calendar_html(self, response):
    """Store the page title of a personal-calendar HTML page as a download."""
    meta, html_data = self.html_response(response)
    page_title = html_data.css('#content_title > h2::text').extract_first()
    yield Download(meta=meta._asdict(), data={'title': page_title})
    self.log_done(meta)
def scrape_course_selection_json(self, response):
    """Queue one class-selection request per study group, then store the JSON."""
    meta, json_data = self.json_response(response)
    # the study-group ids are nested six levels deep in the response
    group_ids = jmespath.search(
        '*[].*[].*[].*[].*[].*[].studiengruppen_id', json_data)
    self.log_select(meta, Key.CLASS, group_ids)
    semester = meta.context[Key.SEMESTER]
    for group_id in group_ids:
        yield self.class_selection_json_request(
            semester, group_id, self.scrape_json)
    yield Download(meta=meta._asdict(), data=json_data)
    self.log_done(meta)
def scrape_location_calendar_html(self, response):
    """Extract title, subtitle, description and room type from a location page."""
    meta, html_data = self.html_response(response)
    subtitle_rows = html_data.css('#content_subtitle > div')
    details = {
        'title':
            html_data.css('#content_title > h2::text').extract_first(),
        'subtitle':
            html_data.css(
                '#content_subtitle > div > div:nth-child(1)::text'
            ).extract_first(),
        'description':
            html_data.css(
                '#content_subtitle > div > div:nth-child(2)::text'
            ).re_first(r'Beschreibung: (.*)$'),
        'type':
            html_data.css(
                '#content_subtitle > div > div:nth-child(3)::text'
            ).re_first(r'Raumtyp: (.*)$'),
    }
    yield Download(meta=meta._asdict(), data=details)
    self.log_done(meta)
def scrape_semester_json(self, response):
    """Fan out index/course/calendar requests for the chosen semesters.

    Scrapes every semester when ``self.all`` is set, otherwise only the
    one flagged current (``isaktuelles``) in the JSON response.
    """
    meta, json_data = self.json_response(response)
    if self.all:
        message, query = 'Start scraping all semesters', '[].id'
    else:
        message = 'Start scraping current semester'
        query = '[? isaktuelles ==`true`].id'
    self.logger.info(message)
    semesters = jmespath.search(query, json_data)
    self.log_select(meta, Key.SEMESTER, semesters)
    for semester in semesters:
        yield self.index_html_request(semester, self.scrape_index_html)
        yield self.course_selection_json_request(
            semester, self.scrape_course_selection_json)
        yield self.personal_calendar_html_request(
            semester, self.scrape_personal_calendar_html)
        yield self.personal_calendar_json_request(semester, self.scrape_json)
    yield Download(meta=meta._asdict(), data=json_data)
    self.log_done(meta)
def scrape_index_html(self, response):
    """Scrape the semester index page.

    Queues per-course and per-location follow-up requests, then stores the
    page metadata (title, JS config variables, time grid, room list).

    Bug fix: the CSS pseudo-class is ``:last-child``; the original
    ``:last_child`` (underscore) is not valid CSS, so the inline-JS
    ``<script>`` was never selected and ``stundenraster`` plus every
    ``js_variables`` field came back empty/None.
    """
    meta, html_data = self.html_response(response)

    # select course ids (without first -1 value)
    courses = html_data.css("#cbstg > option:not(:first-child)").css(
        '::attr(value)').extract()
    self.log_select(meta, Key.COURSE, courses)
    for course in courses:
        yield self.course_json_request(meta.context[Key.SEMESTER], course,
                                       self.scrape_course_json)

    # select location ids (without first -1 value)
    locations = html_data.css("#cbraum > option:not(:first-child)").css(
        "::attr(value)").extract()
    self.log_select(meta, Key.LOCATION, locations)
    for location in locations:
        yield self.location_calendar_html_request(
            meta.context[Key.SEMESTER], location,
            self.scrape_location_calendar_html)
        yield self.location_calendar_json_request(
            meta.context[Key.SEMESTER], location, self.scrape_json)

    # inline JS config lives in the last <script> of <head>; select it once
    # (was ':last_child' — invalid — in two separate places)
    js_variables = html_data.css('head > script:last-child::text')

    # time grid entries look like: stundenraster[0] = ['08.15', '09.45', '1'];
    # NOTE(review): the unescaped '.' in \d{2}.\d{2} matches any separator
    # between hours and minutes — kept as-is to preserve matching behavior.
    stundenraster = []
    for line in js_variables.re(r'stundenraster\[\d+\] = \[(.*)\];'):
        m = re.fullmatch(
            r'\'(?P<starts>\d{2}.\d{2})\', \'(?P<ends>\d{2}.\d{2})\', '
            r'\'(?P<slot>\d+)\'', line)
        if m:
            stundenraster.append({
                'slot': m.group('slot'),
                'starts': m.group('starts'),
                'ends': m.group('ends'),
            })

    # room dropdown entries (skip the leading -1 placeholder option)
    cbraum = []
    for element in html_data.css("#cbraum > option:not(:first-child)"):
        cbraum.append({
            'id': element.css('::attr(value)').extract_first(),
            'title': element.css('::attr(title)').extract_first(),
            'name': element.css('::text').extract_first()
        })

    data = {
        'title': html_data.css('head > title::text').extract_first(),
        'indexLink': js_variables.re_first(r'indexLink = \'(.*)\';'),
        'frontendDir': js_variables.re_first(r'frontendDir = \'(.*)\';'),
        'stplIndexLink':
            js_variables.re_first(r'STPL.IndexLink = \'(.*)\';'),
        'vorlesungsanfang':
            js_variables.re_first(r'Vorlesungsanfang = \'(.*)\';'),
        'vorlesungsende':
            js_variables.re_first(r'Vorlesungsende = \'(.*)\';'),
        'semesteranfang':
            js_variables.re_first(r'Semesteranfang = \'(.*)\';'),
        'semesterende':
            js_variables.re_first(r'Semesterende = \'(.*)\';'),
        'stundenraster': stundenraster,
        'cbraum': cbraum
    }
    yield Download(meta=meta._asdict(), data=data)
    self.log_done(meta)
def scrape_json(self, response):
    """Persist a JSON response verbatim and mark the request as done."""
    meta, payload = self.json_response(response)
    yield Download(meta=meta._asdict(), data=payload)
    self.log_done(meta)