def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Parse a GinkgoMaps index page, collect map metadata from its "smalltable"-containers
    and yield one BaseItem for the whole page.

    Scrapy Contracts:
    @url http://ginkgomaps.com/landkarten_deutschland.html
    @returns items 1

    :param response: the scrapy.http.Response of the currently crawled index page
    :param kwargs: unused (kept for Scrapy callback compatibility)
    :return: yields a BaseItem
    """
    # making sure that the current url is marked as parsed:
    self.debug_parsed_urls.add(response.url)
    # IMPORTANT: modern browsers add "tbody"-elements into tables, scrapy doesn't see those tags!
    # Remember: whatever request you see with the developer tools in your browser, you need to manually remove
    # ANY <tbody>-tag that sits inside your xpath expression, otherwise it will return an empty [] !
    # first index page contains 42 maps, all inside tables of the class "smalltable":
    table_body = response.xpath('//table[@class="smalltable"]')
    description_temp = str()
    # BUGFIX: first_thumbnail was previously initialized with "" and checked against None later,
    # which always added a (possibly empty) thumbnail value; start with None instead.
    first_thumbnail = None
    # BUGFIX: response.xpath() always returns a SelectorList (never None), so the previous
    # "is not None"-check was always True; a truthiness check correctly detects empty results.
    if table_body:
        for table_item in table_body:
            map_title = table_item.xpath('tr/td[1]/a[2]/text()').get()
            map_design_heading = table_item.xpath('tr/td[2]/u[1]/text()').get()
            map_design = table_item.xpath('tr/td[2]/p[1]/text()').get()
            map_content_heading = table_item.xpath('tr/td[2]/u[2]/text()').get()
            map_content = table_item.xpath('tr/td[2]/p[2]/text()').get()
            # BUGFIX: any of the above .get()-calls may return None (e.g. on website layout
            # changes), which previously raised a TypeError during string concatenation;
            # missing parts are now treated as empty strings:
            description_temp += (map_title or "") + "\n" \
                                + (map_design_heading or "") + (map_design or "") \
                                + (map_content_heading or "") + (map_content or "")
        # while we could theoretically grab all thumbnails during the above loop,
        # the first one is enough for a preview-image in edu-sharing
        thumbnail_path = table_body[0].xpath('tr/td[1]/a[1]/img/@src').get()
        if thumbnail_path:
            # BUGFIX: response.urljoin(None) would raise a TypeError, therefore the guard above
            first_thumbnail = response.urljoin(thumbnail_path)
    description_temp = w3lib.html.strip_html5_whitespace(description_temp)
    base = super().getBase(response=response)
    base.add_value('sourceId', response.url)
    last_modified = response.xpath('/html/head/meta[6]/@content').get()
    # BUGFIX: guard against a missing <meta>-element, which previously caused a TypeError
    # when concatenating None + self.version:
    if last_modified is not None:
        hash_temp = last_modified + self.version
        base.add_value('hash', hash_temp)
        base.add_value('lastModified', last_modified)
    base.add_value('type', Constants.TYPE_MATERIAL)
    if first_thumbnail is not None:
        base.add_value('thumbnail', first_thumbnail)
    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value('language', 'de')
    general.add_value('identifier', response.url)
    # the description could be extended with additional infos about the map-formats and their resolutions,
    # (if necessary)
    general.add_value('description', description_temp)
    general.add_value('title', response.xpath('/html/head/title/text()').get())
    # keywords are stored inside a String, separated by commas with (sometimes multiple) whitespaces,
    # therefore RegEx is needed to provide a list with individual keywords since a String.split() isn't enough:
    keyword_string = response.xpath('/html/head/meta[@name="keywords"]/@content').get()
    # BUGFIX: re.split() raises a TypeError on None, therefore only split when keywords exist:
    if keyword_string is not None:
        kw_regex_split = re.split(r'\s*,\s*', keyword_string)
        general.add_value('keyword', kw_regex_split)
    lom.add_value('general', general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value('format', 'text/html')
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())
    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('date', last_modified)
    lifecycle.add_value('role', 'author')
    lifecycle.add_value('firstName', 'Dirk')
    lifecycle.add_value('lastName', 'Benkert')
    lifecycle.add_value('organization', 'Ginkgomaps')
    lifecycle.add_value('url', 'https://dirkbenkert.com/')
    lom.add_value('lifecycle', lifecycle.load_item())
    educational = LomEducationalItemLoader()
    # since the learning objects are maps, expositive seems to be the best fit for interactivityType:
    educational.add_value('interactivityType', 'expositive')
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())
    vs = ValuespaceItemLoader()
    # since no educationalContext is given, either hardcode these values or don't use them at all
    # vs.add_value('educationalContext', ["Sekundarstufe I",
    #                                     "Sekundarstufe II",
    #                                     "Berufliche Bildung",
    #                                     "Erwachsenenbildung"])
    vs.add_value('intendedEndUserRole', ["learner", "teacher", "parent"])
    vs.add_value('discipline', 'Geografie')  # Geografie
    vs.add_value('learningResourceType', 'map')  # Karte
    vs.add_value('conditionsOfAccess', 'no login')
    lic = LicenseItemLoader()
    # if needed, the license description could also be gathered and constructed from multiple tags within a
    # container: /html/body/center/table[1]/tbody/tr[5]/td[2]/p
    license_url: str = response.xpath('/html/body/center/table[1]/tr[5]/td[2]/p/a/@href').get()
    if (license_url is not None) and (license_url.endswith("deed.de")):
        # strip the language-specific "deed.de"-suffix and force https so the license-mapper
        # recognizes the CC-url:
        license_url = license_url[:-len("deed.de")]
        license_url = license_url.replace("http://", "https://")
        lic.add_value('url', license_url)
    lic.add_value('author', response.xpath('/html/head/meta[3]/@content').get())
    base.add_value('valuespaces', vs.load_item())
    base.add_value('license', lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    base.add_value('response', super().mapResponse(response).load_item())
    yield base.load_item()
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Populates a BaseItemLoader with metadata and yields the BaseItem afterwards.

    Scrapy Contracts:
    @url https://www.walter-fendt.de/html5/mde/pythagoras2_de.htm
    @returns items 1

    :param response: the scrapy.http.Response of the currently crawled material page
    :param kwargs: unused (kept for Scrapy callback compatibility)
    :return: yields a BaseItem
    """
    # fetching publication date and lastModified from dynamically loaded <p class="Ende">-element:
    url_data_splash_dict = WebTools.getUrlData(response.url, engine=WebEngine.Pyppeteer)
    splash_html_string = url_data_splash_dict.get('html')
    # NOTE(review): if the rendered page has no <p class="Ende">-element, .get() returns None and
    # the re.split() below would raise a TypeError — TODO confirm every material page has this element
    page_end_element = Selector(text=splash_html_string).xpath('//p[@class="Ende"]').get()
    line_regex = re.compile(r'<br>')
    page_end_string = line_regex.split(page_end_element)
    published_date = None
    last_modified = None
    # the two strings inside the <p>-Container will look like this:
    # Walter Fendt, 2. November 2000
    # Letzte Änderung: 17. Oktober 2017
    # therefore we'll need to extract the dates by splitting up the strings
    for temp_string in page_end_string:
        if temp_string.startswith("Walter Fendt"):
            sentence1 = temp_string.rsplit(', ')
            # each "sentence" list now holds exactly 2 elements, whereby the last element should be the date
            for item in sentence1:
                # only the date-fragment parses successfully; the author-name fragment yields None
                if dateparser.parse(item) is not None:
                    published_date = dateparser.parse(item)
        if temp_string.startswith('Letzte Änderung:'):
            sentence2 = temp_string.rsplit(': ')
            for item2 in sentence2:
                if dateparser.parse(item2) is not None:
                    last_modified = dateparser.parse(item2)
    base = super().getBase(response=response)
    base.add_value('type', Constants.TYPE_MATERIAL)
    # hash and lastModified are only set when a "Letzte Änderung"-date could actually be parsed:
    if last_modified is not None:
        hash_temp = last_modified.isoformat() + self.version
        base.add_value('hash', hash_temp)
        base.add_value('lastModified', last_modified.isoformat())
    base.add_value('sourceId', response.url)
    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value('identifier', response.url)
    general.add_value('title', response.xpath('/html/head/title/text()').get())
    general.add_value(
        'description',
        response.xpath('/html/head/meta[@name="description"]/@content').get())
    keywords_string: str = response.xpath('/html/head/meta[@name="keywords"]/@content').get()
    if keywords_string is not None:
        # keywords come as one comma-separated string and are split into a list here:
        keyword_list = keywords_string.rsplit(", ")
        general.add_value('keyword', keyword_list)
    general.add_value('language', 'de')
    lom.add_value('general', general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value('format', "text/html")
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())
    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('role', 'author')
    lifecycle.add_value('firstName', 'Walter')
    lifecycle.add_value('lastName', 'Fendt')
    lifecycle.add_value('url', "https://www.walter-fendt.de/wf.htm")  # author information
    if published_date is not None:
        lifecycle.add_value('date', published_date.isoformat())
    lom.add_value('lifecycle', lifecycle.load_item())
    educational = LomEducationalItemLoader()
    educational.add_value('interactivityType', 'mixed')
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())
    vs = ValuespaceItemLoader()
    vs.add_value('conditionsOfAccess', 'no login')
    vs.add_value('discipline', 'Mathematik')
    vs.add_value('intendedEndUserRole', ['learner', 'teacher', 'parent'])
    vs.add_value('learningResourceType', ['application', 'web page'])
    vs.add_value('price', 'no')
    base.add_value('valuespaces', vs.load_item())
    lic = LicenseItemLoader()
    lic.add_value('author', 'Walter Fendt')
    # if scrapy could render the <p class="Ende">-element, the license url could be found with the following XPath:
    # license_url = response.xpath('//p[@class="Ende"]/a[@rel="license"]/@href')
    # but since scrapy can't "see" this container, we're extracting the information with scrapy-splash
    license_url: str = Selector(text=splash_html_string).xpath(
        '//p[@class="Ende"]/a[@rel="license"]/@href').get()
    if license_url is not None:
        if license_url.startswith("http://"):
            license_url = license_url.replace("http://", "https://")
        # the license url links to the /de/ version, which currently doesn't get mapped properly
        # "https://creativecommons.org/licenses/by-nc-sa/3.0/de/"
        # -> 'https://creativecommons.org/licenses/by-nc-sa/3.0/' is the url-format we want
        if "creativecommons.org/licenses/" in license_url and license_url.endswith("/de/"):
            license_url = license_url.split("de/")[0]
        lic.add_value('url', license_url)
    base.add_value('license', lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    # TODO: fix super().mapResponse
    base.add_value('response', super().mapResponse(response).load_item())
    yield base.load_item()
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Parses an individual 'worksheet' and combines the metadata with data from its 'bundle'-dictionary.

    Spider Contracts:
    @url https://editor.mnweg.org/mnw/dokument/vocabulary-around-the-world-3
    @returns items 1

    :param response: the scrapy.http.Response of the first worksheet of a bundle
    :param kwargs: bundle metadata gathered by the calling method (e.g. 'bundle_url', 'bundle_title',
        'bundle_description', 'worksheet_description_summary', 'bundle_thumbnail',
        'bundle_ld_json_organization', 'bundle_educational_level', 'bundle_discipline')
    :return: yields a BaseItemLoader
    """
    # since we're only parsing the first worksheet for some additional metadata, the metadata object will be
    # centered around a bundle, not the individual pages
    date_published = response.xpath('//ul[@class="meta"]/li[3]/text()').get()
    base = BaseItemLoader()
    base.add_value("sourceId", kwargs.get('bundle_url'))
    # NOTE(review): if the date-xpath yields None, the concatenation below raises a TypeError
    # — TODO confirm the <ul class="meta">-element is present on every worksheet page
    hash_temp = str(date_published + self.version)
    base.add_value("hash", hash_temp)
    # this is a hacky solution: the thumbnail is the miniature preview of the bundle's first worksheet
    bundle_thumbnail = kwargs.get('bundle_thumbnail')
    if bundle_thumbnail is not None:
        base.add_value('thumbnail', bundle_thumbnail)
    base.add_value('type', Constants.TYPE_MATERIAL)
    base.add_value('lastModified', date_published)
    lom = LomBaseItemloader()
    general = LomGeneralItemloader()
    general.add_value('title', kwargs.get('bundle_title'))
    description_temp = str()
    bundle_desc_temp = kwargs.get('bundle_description')
    worksheet_desc_temp = kwargs.get('worksheet_description_summary')
    # not every bundle has a description, but there's always worksheet descriptions available:
    # NOTE(review): the first branch assumes worksheet_desc_temp is not None whenever a bundle
    # description exists; a None value would raise a TypeError here — verify against the caller
    if bundle_desc_temp is not None:
        description_temp: str = bundle_desc_temp + "\n\n" + worksheet_desc_temp
    elif bundle_desc_temp is None and worksheet_desc_temp is not None:
        description_temp: str = worksheet_desc_temp
    general.add_value('description', description_temp)
    general.add_value('language', 'de')
    general.add_value('identifier', kwargs.get('bundle_url'))
    lom.add_value('general', general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value("format", "text/html")
    technical.add_value('location', kwargs.get('bundle_url'))
    lom.add_value('technical', technical.load_item())
    lifecycle = LomLifecycleItemloader()
    bundle_organization: dict = kwargs.get('bundle_ld_json_organization')
    # the dictionary that we can parse from the website itself looks like this:
    # 'organization': {'@context': 'http://schema.org',
    #                  '@type': 'Organization',
    #                  'name': 'Materialnetzwerk e. G.',
    #                  'sameAs': ['http://twitter.com/materialnw',
    #                             'https://www.facebook.com/materialnetzwerk'],
    #                  'url': 'https://editor.mnweg.org'}}
    # TODO: once its possible to parse a 'organization'-schema-type as a dictionary by the back-end, use
    #  lifecycle.add_value('organization', bundle_organization)
    if bundle_organization is not None:
        lifecycle.add_value('organization', bundle_organization.get("name"))
        lifecycle.add_value('url', bundle_organization.get("url"))
    lifecycle.add_value('date', date_published)
    lom.add_value('lifecycle', lifecycle.load_item())
    educational = LomEducationalItemLoader()
    # TODO: educationalLevel is currently unsupported in the items.py backend?
    educational_level = kwargs.get('bundle_educational_level')
    if educational_level is not None:
        educational.add_value('educationalLevel', educational_level)
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())
    vs = ValuespaceItemLoader()
    vs.add_value('learningResourceType', 'teaching module')
    bundle_discipline = kwargs.get('bundle_discipline')
    if bundle_discipline is not None:
        # map the site-specific discipline string to our vocabulary (if a mapping exists),
        # otherwise keep the raw value:
        if self.discipline_mapping.get(bundle_discipline) is not None:
            bundle_discipline = self.discipline_mapping.get(bundle_discipline)
        vs.add_value('discipline', bundle_discipline)
    vs.add_value('intendedEndUserRole', 'teacher')
    # logged in users can manipulate the worksheets and fit them to their needs,
    # but there's no login required for just downloading the pdf of an available worksheet
    vs.add_value('conditionsOfAccess', "login required for additional features")
    vs.add_value('price', 'no')
    # we can map "Phase" to our educationalContext with the following ValuespaceHelper method:
    if educational_level is not None:
        vs.add_value(
            "educationalContext",
            ValuespaceHelper.educationalContextByGrade(educational_level))
    lic = LicenseItemLoader()
    # everything is CC-BY-SA 3.0 according to the FAQs: https://mnweg.org/faqs
    lic.add_value('url', Constants.LICENSE_CC_BY_SA_30)
    base.add_value('license', lic.load_item())
    response_loader = ResponseItemLoader()
    response_loader.add_value('url', kwargs.get('bundle_url'))
    base.add_value('valuespaces', vs.load_item())
    base.add_value('response', response_loader.load_item())
    yield base.load_item()
def get_metadata_from_review_url(self, response: scrapy.http.Response, **kwargs):
    """
    Grabs metadata from the "material_review_url"-page and uses the wp_json_item from the "parse_page"-method
    to return a BaseItemLoader with the combined metadata from both sources.

    :param response: the scrapy.http.Response object for the currently parsed page
    :param kwargs: carries the wp_json_item-dictionary under the key "item"
    :return: yields a BaseItemLoader
    """
    wp_json_item = kwargs.get("item")
    # NOTE(review): .get() returns None when the ld+json script-tag is missing, which would make
    # .strip() raise an AttributeError — TODO confirm every reviewed page carries this tag
    ld_json_string = response.xpath('/html/head/script[@type="application/ld+json"]/text()').get().strip()
    ld_json_string = html.unescape(ld_json_string)
    ld_json = json.loads(ld_json_string)
    hash_temp: Optional[str] = None
    language_temp: Optional[str] = None
    pub_date: Optional[str] = None
    organization_id: Optional[str] = None
    organization_name: Optional[str] = None
    date_modified: Optional[str] = None
    # this is a workaround to make sure that we actually grab the following data,
    # no matter where they are positioned in the list:
    # - dateModified
    # - inLanguage
    # - datePublished
    # - organization_name and url
    # e.g.: since there seems to be fluctuation how many elements the "@graph"-Array holds, we can't be sure
    # which position "dateModified" actually has:
    # sometimes it's ld_json.get("@graph")[2], sometimes on [3] etc., therefore we must check all of them
    ld_graph_items = ld_json.get("@graph")
    for item in ld_graph_items:
        if item.get("dateModified") is not None:
            date_modified = item.get("dateModified")  # this can be used instead of 'date' in lastModified
            hash_temp = item.get("dateModified") + self.version
        if item.get("@type") == "WebSite":
            language_temp = item.get("inLanguage")
        if item.get("@type") == "WebPage":
            pub_date = item.get("datePublished")
        if item.get("@type") == "Organization":
            organization_id = item.get("@id")
            organization_name = item.get("name")
    base = BaseItemLoader()
    base.add_value("sourceId", response.url)
    base.add_value("hash", hash_temp)
    # base.add_value("response", super().mapResponse(response).load_item())
    base.add_value("type", Constants.TYPE_MATERIAL)  # TODO: is this correct? use mapping for edu-context?
    base.add_value("thumbnail", wp_json_item.get("material_screenshot"))
    # base.add_value("lastModified", wp_json_item.get("date"))  # is "date" from wp_json for lastModified correct?
    base.add_value("lastModified", date_modified)  # or is this one better (grabbed from material_review_url)?
    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value("title", wp_json_item.get("material_titel"))
    # the source material heavily fluctuates between perfectly fine strings and messy (hardcoded) html tags
    # as well as "\n" and "\t", therefore we need to clean up that String first:
    raw_description = wp_json_item.get("material_beschreibung")
    raw_description = w3lib.html.remove_tags(raw_description)
    raw_description = w3lib.html.strip_html5_whitespace(raw_description)
    clean_description = w3lib.html.replace_escape_chars(raw_description)
    general.add_value("description", clean_description)
    general.add_value("identifier", wp_json_item.get("id"))
    if language_temp is not None:
        general.add_value("language", language_temp)
    kw_temp = list()
    # keywords arrive as a list of {"name": ...} dictionaries; collect only the names:
    for item in wp_json_item.get("material_schlagworte"):
        kw_temp.append(item.get("name"))
    general.add_value("keyword", kw_temp)
    lom.add_value("general", general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value("format", "text/html")
    technical.add_value("location", wp_json_item.get("material_review_url"))
    lom.add_value("technical", technical.load_item())
    lifecycle = LomLifecycleItemloader()
    if organization_name is not None:
        lifecycle.add_value("organization", organization_name)
    if organization_id is not None:
        lifecycle.add_value("url", organization_id)
    if pub_date is not None:
        lifecycle.add_value("date", pub_date)
    lom.add_value("lifecycle", lifecycle.load_item())
    educational = LomEducationalItemLoader()
    if wp_json_item.get("material_altersstufe") is not None:
        # age range is returned as a list of <from_age>-<to_age>-Strings, possible return values are:
        # e.g. "01-05", "05-10", "10-13", "13-15", "15-19" and "18-99"
        age_regex = re.compile(r'(\d{1,2})-(\d{1,2})')
        age_range = set()
        age_range_item_loader = LomAgeRangeItemLoader()
        for item in wp_json_item.get("material_altersstufe"):
            age_range_temp = item.get("name")
            age_from = str(age_regex.search(age_range_temp).group(1))
            age_to = str(age_regex.search(age_range_temp).group(2))
            age_range.add(age_from)
            age_range.add(age_to)
        # NOTE(review): min()/max() compare the collected ages as strings, not as integers
        # — e.g. "5" would sort above "18"; the observed values are zero-padded two-digit
        # strings which keeps the ordering correct, verify this invariant holds
        if len(age_range) != 0:
            age_range_item_loader.add_value("fromRange", min(age_range))
            age_range_item_loader.add_value("toRange", max(age_range))
            educational.add_value("typicalAgeRange", age_range_item_loader.load_item())
    lom.add_value("educational", educational.load_item())
    base.add_value("lom", lom.load_item())
    vs = ValuespaceItemLoader()
    vs.add_value("discipline", "http://w3id.org/openeduhub/vocabs/discipline/520")  # Religion
    # mapping educationalContext
    educational_context = list()
    for edu_con_item in wp_json_item.get("material_bildungsstufe"):
        educational_context.append(edu_con_item.get("name"))
    for edu_item in educational_context:
        if edu_item in self.mapping_edu_context.keys():
            edu_item = self.mapping_edu_context.get(edu_item)
        if edu_item != "":
            vs.add_value("educationalContext", edu_item)
    # using mapped media_type_list for valuespaces -> learningResourceType
    # see: https://vocabs.openeduhub.de/w3id.org/openeduhub/vocabs/learningResourceType/index.html
    media_type_list = list()
    for item in wp_json_item.get("material_medientyp"):
        media_type_list.append(item.get("name"))
    for media_type_item in media_type_list:
        if media_type_item in self.mapping_media_types.keys():
            media_type_item = self.mapping_media_types.get(media_type_item)
        if media_type_item != "":
            vs.add_value("learningResourceType", media_type_item)
    # there's metadata for "Kompetenzen" (e.g.: "Deuten", "Gestalten", "Reflexion") within the returned wp_json
    # that our data-model doesn't support yet. for future reference though:
    # wp_json_item.get("material_kompetenzen") -> list
    vs.add_value("intendedEndUserRole", "teacher")
    lic = LicenseItemLoader()
    license_regex_nc_reuse = re.compile(r'Zur nicht kommerziellen Wiederverwendung gekennzeichnet')
    license_regex_nc_reuse_and_change = re.compile(
        r'Zur nicht kommerziellen Wiederverwendung und Veränderung gekennzeichnet')
    # important clarification from rpi-virtuell:
    # 'frei zugänglich' describes 'ungeklärte Lizenz' / 'volles Urheberrecht'
    # CC licenses > 'frei zugänglich' if both values are found in the license description
    license_regex_free_access = re.compile(r'frei zugänglich')
    license_regex_free_after_signup = re.compile(r'kostenfrei nach Anmeldung')
    license_regex_with_costs = re.compile(r'kostenpflichtig')
    license_description = response.xpath('//div[@class="material-detail-meta-access material-meta"]'
                                         '/div[@class="material-meta-content-entry"]/text()').get()
    if license_description is not None:
        license_description = html.unescape(license_description.strip())
        lic.add_value("description", license_description)
        # if the RegEx search finds something, it returns a match-object.
        # otherwise by default it returns None
        cc_by_nc_nd = license_regex_nc_reuse.search(license_description)
        cc_by_nc_sa = license_regex_nc_reuse_and_change.search(license_description)
        if cc_by_nc_nd is not None:
            lic.add_value("url", Constants.LICENSE_CC_BY_NC_ND_40)
        if cc_by_nc_sa is not None:
            lic.add_value("url", Constants.LICENSE_CC_BY_NC_SA_30)
        # if a material is "frei zugänglich", set price to none, but don't override a previously set CC-license
        if license_regex_free_access.search(license_description) is not None:
            vs.add_value("price", "no")
            # only if "frei zugänglich" is the only license-description this will trigger:
            # see https://rpi-virtuell.de/nutzungsbedingungen/ (5.)
            if license_regex_free_access.match(license_description) is not None:
                lic.add_value("url", Constants.LICENSE_CC_BY_SA_40)
        if license_regex_with_costs.search(license_description):
            lic.add_value("internal", Constants.LICENSE_COPYRIGHT_LAW)
            vs.add_value("price", "yes")
        if license_regex_free_after_signup.search(license_description):
            vs.add_value("price", "yes")
            vs.add_value("conditionsOfAccess", "login")
    else:
        # by default, all materials should be CC_BY_SA - according to the rpi-virtuell ToS
        lic.replace_value("url", Constants.LICENSE_CC_BY_SA_40)
    authors = list()
    # the author should end up in LOM lifecycle, but the returned metadata are too messily formatted to parse them
    # by easy patterns like (first name) + (last name)
    for item in wp_json_item.get("material_autoren"):
        if item.get("name") is not None:
            if item.get("name").strip() != "":
                authors.append(item.get("name"))
    lic.add_value("author", authors)
    base.add_value("valuespaces", vs.load_item())
    base.add_value("license", lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value("permissions", permissions.load_item())
    response_loader = ResponseItemLoader()
    response_loader.add_value("url", response.url)
    base.add_value("response", response_loader.load_item())
    yield base.load_item()
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Parses an individual topic url for metadata and yields a BaseItem.

    Scrapy Contracts:
    @url https://www.umwelt-im-unterricht.de/hintergrund/generationengerechtigkeit-klimaschutz-und-eine-lebenswerte-zukunft/
    @returns item 1

    :param response: the scrapy.http.Response of the currently crawled topic page
    :param kwargs: unused (kept for Scrapy callback compatibility)
    :return: yields a BaseItem
    """
    current_url: str = response.url
    base = BaseItemLoader()
    base.add_value('sourceId', response.url)
    # NOTE(review): if the info-<span> is missing, date_raw is None and
    # strip_html5_whitespace() would raise — TODO confirm the element exists on every topic page
    date_raw: str = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get()
    date_cleaned_up: str = w3lib.html.strip_html5_whitespace(date_raw)
    hash_temp = str(date_cleaned_up + self.version)
    base.add_value('hash', hash_temp)
    base.add_value('lastModified', date_cleaned_up)
    base.add_value('type', Constants.TYPE_MATERIAL)
    # base.add_value('thumbnail', thumbnail_url)
    lom = LomBaseItemloader()
    general = LomGeneralItemloader()
    general.add_value('identifier', response.url)
    title: str = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get()
    general.add_value('title', title)
    keywords: list = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
    if len(keywords) >= 1:
        # only add keywords if the list isn't empty
        general.add_value('keyword', keywords)
    description: str = response.xpath('/html/head/meta[@name="description"]/@content').get()
    general.add_value('description', description)
    general.add_value('language', 'de')
    lom.add_value('general', general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value('format', 'text/html')
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())
    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('role', 'publisher')
    lifecycle.add_value('date', date_cleaned_up)
    lifecycle.add_value('url', "https://www.umwelt-im-unterricht.de/impressum/")
    lifecycle.add_value(
        'organization',
        'Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit (BMU)'
    )
    lom.add_value('lifecycle', lifecycle.load_item())
    educational = LomEducationalItemLoader()
    educational.add_value('language', 'de')
    # TODO: a didactic comment could fit into either one of these:
    #  - educational.description
    #  - classification.description (with classification.purpose set to 'educational objective')
    if "/wochenthemen/" in current_url:
        # didactic comments are only part of "Thema der Woche"
        didactic_comment = response.xpath(
            '//div[@class="c-collapse-content js-collapse-content"]').get()
        if didactic_comment is not None:
            didactic_comment = w3lib.html.remove_tags(didactic_comment)
            # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment, which_ones='\t', replace_by=" ")
            # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment)
            didactic_comment = " ".join(didactic_comment.split())
            if didactic_comment.endswith("mehr lesenweniger lesen"):
                # the button-description of the expandable info-box ends up in the string,
                # therefore we are manually removing it:
                didactic_comment = didactic_comment.replace("mehr lesenweniger lesen", "")
            # since there's currently no way to confirm how the string looks in the web-interface:
            # ToDo: make sure which string format looks best in edu-sharing (cleaned up <-> with escape chars)
            educational.add_value('description', didactic_comment)
    lom.add_value('educational', educational.load_item())
    classification = LomClassificationItemLoader()
    if "/unterrichtsvorschlaege/" in current_url:
        classification.add_value('purpose', 'competency')
        competency_description: list = response.xpath(
            '//div[@class="b-cpsuiu-show-description"]/*[not('
            '@class="cc-licence-info")]').getall()
        # the xpath-expression for competency_description will grab the whole div-element,
        # but EXCLUDE the "license"-container (if the license-description exists, it's always part of the same div)
        if len(competency_description) >= 1:
            # only if the list of strings is not empty, we'll try to type-convert it to a string (and clean its
            # formatting up)
            competency_description: str = " ".join(competency_description)
            competency_description = w3lib.html.remove_tags(competency_description)
            classification.add_value('description', competency_description)
    lom.add_value('classification', classification.load_item())
    base.add_value('lom', lom.load_item())
    vs = ValuespaceItemLoader()
    # depending on the website-category, we need to set a specific learningResourceType
    # because the value 'website' for all crawled items would not be helpful enough
    if "/wochenthemen/" in current_url or "/unterrichtsvorschlaege/" in current_url:
        vs.add_value('learningResourceType', 'lesson plan')
    if "/hintergrund/" in current_url:
        vs.add_value('learningResourceType', 'Text')
    if "/medien/dateien/" in current_url:
        # topics categorized as "Arbeitsmaterial" offer customizable worksheets to teachers
        vs.add_value('learningResourceType', 'worksheet')
    if "/medien/videos/" in current_url:
        vs.add_value('learningResourceType', 'video')
    if "/medien/bilder/" in current_url:
        # topics categorized as "Bilderserie" hold several images in a gallery (with individual licenses)
        vs.add_value('learningResourceType', 'image')
    vs.add_value('price', 'no')
    vs.add_value('containsAdvertisement', 'no')
    vs.add_value('conditionsOfAccess', 'no login')
    vs.add_value('intendedEndUserRole', 'teacher')
    # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/
    vs.add_value('accessibilitySummary', 'Not tested')
    # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/
    vs.add_value('dataProtectionConformity', 'Sensible data collection')
    # see: https://www.umwelt-im-unterricht.de/datenschutz/
    disciplines_raw: list = response.xpath(
        '//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
    if len(disciplines_raw) >= 1:
        disciplines = list()
        for discipline_value in disciplines_raw:
            # self.debug_discipline_values.add(discipline_value)
            if discipline_value in self.DISCIPLINE_MAPPING.keys():
                discipline_value = self.DISCIPLINE_MAPPING.get(discipline_value)
            # since the mapping value can either be a single string OR a list of strings, we need to make sure that
            # our 'disciplines'-list is a list of strings (not a list with nested lists):
            if type(discipline_value) is list:
                disciplines.extend(discipline_value)
            else:
                disciplines.append(discipline_value)
        if len(disciplines) >= 1:
            vs.add_value('discipline', disciplines)
    educational_context_raw = response.xpath(
        '//div[@class="b-cpsuiu-show-targets"]/ul/li/a/text()').getall()
    if len(educational_context_raw) >= 1:
        # the educationalContext-mapping is only done when there's at least one educational_context found
        educational_context = list()
        for educational_context_value in educational_context_raw:
            # self.debug_educational_context_values.add(educational_context_value)
            if educational_context_value in self.EDUCATIONAL_CONTEXT_MAPPING.keys():
                educational_context_value = self.EDUCATIONAL_CONTEXT_MAPPING.get(
                    educational_context_value)
            if type(educational_context_value) is list:
                educational_context.extend(educational_context_value)
            else:
                educational_context.append(educational_context_value)
        if len(educational_context) >= 1:
            vs.add_value('educationalContext', educational_context)
    base.add_value('valuespaces', vs.load_item())
    lic = LicenseItemLoader()
    license_url: str = response.xpath(
        '//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get()
    if license_url is not None:
        if license_url.startswith("http://"):
            # the license-mapper expects urls that are in https:// format, but UIU uses http:// links to CC-licenses
            license_url = license_url.replace("http://", "https://")
        lic.add_value('url', license_url)
    license_description_raw: str = response.xpath('//div[@class="cc-licence-info"]').get()
    if license_description_raw is not None:
        license_description_raw = w3lib.html.remove_tags(license_description_raw)
        license_description_raw = w3lib.html.replace_escape_chars(
            license_description_raw, which_ones="\n", replace_by=" ")
        # if we would replace_escape_chars() straight away, there would be words stuck together that don't belong
        # together. just replacing \n with a whitespace is enough to keep the structure of the string intact.
        license_description_raw = w3lib.html.replace_escape_chars(license_description_raw)
        # making sure that there's only 1 whitespace between words:
        license_description = " ".join(license_description_raw.split())
        lic.add_value('description', license_description)
    base.add_value('license', lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    response_loader = super().mapResponse(response)
    base.add_value('response', response_loader.load_item())
    yield base.load_item()