def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Parse a GinkgoMaps index page, collect map metadata from its "smalltable"-containers
    and yield one BaseItem for the whole page.

    Scrapy Contracts:
    @url http://ginkgomaps.com/landkarten_deutschland.html
    @returns items 1

    :param response: the scrapy.http.Response of the currently crawled index page
    :param kwargs: unused (kept for Scrapy callback compatibility)
    :return: yields a BaseItem
    """
    # making sure that the current url is marked as parsed:
    self.debug_parsed_urls.add(response.url)
    # IMPORTANT: modern browsers add "tbody"-elements into tables, scrapy doesn't see those tags!
    # Remember: whatever request you see with the developer tools in your browser, you need to manually remove
    # ANY <tbody>-tag that sits inside your xpath expression, otherwise it will return an empty [] !
    # first index page contains 42 maps, all inside tables of the class "smalltable":
    table_body = response.xpath('//table[@class="smalltable"]')
    description_temp = str()
    # BUGFIX: first_thumbnail was previously initialized with "" and checked against None later,
    # which always added a (possibly empty) thumbnail value; start with None instead.
    first_thumbnail = None
    # BUGFIX: response.xpath() always returns a SelectorList (never None), so the previous
    # "is not None"-check was always True; a truthiness check correctly detects empty results.
    if table_body:
        for table_item in table_body:
            map_title = table_item.xpath('tr/td[1]/a[2]/text()').get()
            map_design_heading = table_item.xpath('tr/td[2]/u[1]/text()').get()
            map_design = table_item.xpath('tr/td[2]/p[1]/text()').get()
            map_content_heading = table_item.xpath('tr/td[2]/u[2]/text()').get()
            map_content = table_item.xpath('tr/td[2]/p[2]/text()').get()
            # BUGFIX: any of the above .get()-calls may return None (e.g. on website layout
            # changes), which previously raised a TypeError during string concatenation;
            # missing parts are now treated as empty strings:
            description_temp += (map_title or "") + "\n" \
                                + (map_design_heading or "") + (map_design or "") \
                                + (map_content_heading or "") + (map_content or "")
        # while we could theoretically grab all thumbnails during the above loop,
        # the first one is enough for a preview-image in edu-sharing
        thumbnail_path = table_body[0].xpath('tr/td[1]/a[1]/img/@src').get()
        if thumbnail_path:
            # BUGFIX: response.urljoin(None) would raise a TypeError, therefore the guard above
            first_thumbnail = response.urljoin(thumbnail_path)
    description_temp = w3lib.html.strip_html5_whitespace(description_temp)
    base = super().getBase(response=response)
    base.add_value('sourceId', response.url)
    last_modified = response.xpath('/html/head/meta[6]/@content').get()
    # BUGFIX: guard against a missing <meta>-element, which previously caused a TypeError
    # when concatenating None + self.version:
    if last_modified is not None:
        hash_temp = last_modified + self.version
        base.add_value('hash', hash_temp)
        base.add_value('lastModified', last_modified)
    base.add_value('type', Constants.TYPE_MATERIAL)
    if first_thumbnail is not None:
        base.add_value('thumbnail', first_thumbnail)
    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value('language', 'de')
    general.add_value('identifier', response.url)
    # the description could be extended with additional infos about the map-formats and their resolutions,
    # (if necessary)
    general.add_value('description', description_temp)
    general.add_value('title', response.xpath('/html/head/title/text()').get())
    # keywords are stored inside a String, separated by commas with (sometimes multiple) whitespaces,
    # therefore RegEx is needed to provide a list with individual keywords since a String.split() isn't enough:
    keyword_string = response.xpath('/html/head/meta[@name="keywords"]/@content').get()
    # BUGFIX: re.split() raises a TypeError on None, therefore only split when keywords exist:
    if keyword_string is not None:
        kw_regex_split = re.split(r'\s*,\s*', keyword_string)
        general.add_value('keyword', kw_regex_split)
    lom.add_value('general', general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value('format', 'text/html')
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())
    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('date', last_modified)
    lifecycle.add_value('role', 'author')
    lifecycle.add_value('firstName', 'Dirk')
    lifecycle.add_value('lastName', 'Benkert')
    lifecycle.add_value('organization', 'Ginkgomaps')
    lifecycle.add_value('url', 'https://dirkbenkert.com/')
    lom.add_value('lifecycle', lifecycle.load_item())
    educational = LomEducationalItemLoader()
    # since the learning objects are maps, expositive seems to be the best fit for interactivityType:
    educational.add_value('interactivityType', 'expositive')
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())
    vs = ValuespaceItemLoader()
    # since no educationalContext is given, either hardcode these values or don't use them at all
    # vs.add_value('educationalContext', ["Sekundarstufe I",
    #                                     "Sekundarstufe II",
    #                                     "Berufliche Bildung",
    #                                     "Erwachsenenbildung"])
    vs.add_value('intendedEndUserRole', ["learner", "teacher", "parent"])
    vs.add_value('discipline', 'Geografie')  # Geografie
    vs.add_value('learningResourceType', 'map')  # Karte
    vs.add_value('conditionsOfAccess', 'no login')
    lic = LicenseItemLoader()
    # if needed, the license description could also be gathered and constructed from multiple tags within a
    # container: /html/body/center/table[1]/tbody/tr[5]/td[2]/p
    license_url: str = response.xpath('/html/body/center/table[1]/tr[5]/td[2]/p/a/@href').get()
    if (license_url is not None) and (license_url.endswith("deed.de")):
        # strip the language-specific "deed.de"-suffix and force https so the license-mapper
        # recognizes the CC-url:
        license_url = license_url[:-len("deed.de")]
        license_url = license_url.replace("http://", "https://")
        lic.add_value('url', license_url)
    lic.add_value('author', response.xpath('/html/head/meta[3]/@content').get())
    base.add_value('valuespaces', vs.load_item())
    base.add_value('license', lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    base.add_value('response', super().mapResponse(response).load_item())
    yield base.load_item()
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Populates a BaseItemLoader with metadata and yields the BaseItem afterwards.

    Scrapy Contracts:
    @url https://www.walter-fendt.de/html5/mde/pythagoras2_de.htm
    @returns items 1

    :param response: the scrapy.http.Response of the currently crawled material page
    :param kwargs: unused (kept for Scrapy callback compatibility)
    :return: yields a BaseItem
    """
    # fetching publication date and lastModified from dynamically loaded <p class="Ende">-element:
    url_data_splash_dict = WebTools.getUrlData(response.url, engine=WebEngine.Pyppeteer)
    splash_html_string = url_data_splash_dict.get('html')
    # NOTE(review): if the rendered page has no <p class="Ende">-element, .get() returns None and
    # the re.split() below would raise a TypeError — TODO confirm every material page has this element
    page_end_element = Selector(text=splash_html_string).xpath('//p[@class="Ende"]').get()
    line_regex = re.compile(r'<br>')
    page_end_string = line_regex.split(page_end_element)
    published_date = None
    last_modified = None
    # the two strings inside the <p>-Container will look like this:
    # Walter Fendt, 2. November 2000
    # Letzte Änderung: 17. Oktober 2017
    # therefore we'll need to extract the dates by splitting up the strings
    for temp_string in page_end_string:
        if temp_string.startswith("Walter Fendt"):
            sentence1 = temp_string.rsplit(', ')
            # each "sentence" list now holds exactly 2 elements, whereby the last element should be the date
            for item in sentence1:
                # only the date-fragment parses successfully; the author-name fragment yields None
                if dateparser.parse(item) is not None:
                    published_date = dateparser.parse(item)
        if temp_string.startswith('Letzte Änderung:'):
            sentence2 = temp_string.rsplit(': ')
            for item2 in sentence2:
                if dateparser.parse(item2) is not None:
                    last_modified = dateparser.parse(item2)
    base = super().getBase(response=response)
    base.add_value('type', Constants.TYPE_MATERIAL)
    # hash and lastModified are only set when a "Letzte Änderung"-date could actually be parsed:
    if last_modified is not None:
        hash_temp = last_modified.isoformat() + self.version
        base.add_value('hash', hash_temp)
        base.add_value('lastModified', last_modified.isoformat())
    base.add_value('sourceId', response.url)
    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value('identifier', response.url)
    general.add_value('title', response.xpath('/html/head/title/text()').get())
    general.add_value(
        'description',
        response.xpath('/html/head/meta[@name="description"]/@content').get())
    keywords_string: str = response.xpath('/html/head/meta[@name="keywords"]/@content').get()
    if keywords_string is not None:
        # keywords come as one comma-separated string and are split into a list here:
        keyword_list = keywords_string.rsplit(", ")
        general.add_value('keyword', keyword_list)
    general.add_value('language', 'de')
    lom.add_value('general', general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value('format', "text/html")
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())
    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('role', 'author')
    lifecycle.add_value('firstName', 'Walter')
    lifecycle.add_value('lastName', 'Fendt')
    lifecycle.add_value('url', "https://www.walter-fendt.de/wf.htm")  # author information
    if published_date is not None:
        lifecycle.add_value('date', published_date.isoformat())
    lom.add_value('lifecycle', lifecycle.load_item())
    educational = LomEducationalItemLoader()
    educational.add_value('interactivityType', 'mixed')
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())
    vs = ValuespaceItemLoader()
    vs.add_value('conditionsOfAccess', 'no login')
    vs.add_value('discipline', 'Mathematik')
    vs.add_value('intendedEndUserRole', ['learner', 'teacher', 'parent'])
    vs.add_value('learningResourceType', ['application', 'web page'])
    vs.add_value('price', 'no')
    base.add_value('valuespaces', vs.load_item())
    lic = LicenseItemLoader()
    lic.add_value('author', 'Walter Fendt')
    # if scrapy could render the <p class="Ende">-element, the license url could be found with the following XPath:
    # license_url = response.xpath('//p[@class="Ende"]/a[@rel="license"]/@href')
    # but since scrapy can't "see" this container, we're extracting the information with scrapy-splash
    license_url: str = Selector(text=splash_html_string).xpath(
        '//p[@class="Ende"]/a[@rel="license"]/@href').get()
    if license_url is not None:
        if license_url.startswith("http://"):
            license_url = license_url.replace("http://", "https://")
        # the license url links to the /de/ version, which currently doesn't get mapped properly
        # "https://creativecommons.org/licenses/by-nc-sa/3.0/de/"
        # -> 'https://creativecommons.org/licenses/by-nc-sa/3.0/' is the url-format we want
        if "creativecommons.org/licenses/" in license_url and license_url.endswith("/de/"):
            license_url = license_url.split("de/")[0]
        lic.add_value('url', license_url)
    base.add_value('license', lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    # TODO: fix super().mapResponse
    base.add_value('response', super().mapResponse(response).load_item())
    yield base.load_item()
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Parses an individual 'worksheet' and combines the metadata with data from its 'bundle'-dictionary.

    Spider Contracts:
    @url https://editor.mnweg.org/mnw/dokument/vocabulary-around-the-world-3
    @returns items 1

    :param response: the scrapy.http.Response of the first worksheet of a bundle
    :param kwargs: bundle metadata gathered by the calling method (e.g. 'bundle_url', 'bundle_title',
        'bundle_description', 'worksheet_description_summary', 'bundle_thumbnail',
        'bundle_ld_json_organization', 'bundle_educational_level', 'bundle_discipline')
    :return: yields a BaseItemLoader
    """
    # since we're only parsing the first worksheet for some additional metadata, the metadata object will be
    # centered around a bundle, not the individual pages
    date_published = response.xpath('//ul[@class="meta"]/li[3]/text()').get()
    base = BaseItemLoader()
    base.add_value("sourceId", kwargs.get('bundle_url'))
    # NOTE(review): if the date-xpath yields None, the concatenation below raises a TypeError
    # — TODO confirm the <ul class="meta">-element is present on every worksheet page
    hash_temp = str(date_published + self.version)
    base.add_value("hash", hash_temp)
    # this is a hacky solution: the thumbnail is the miniature preview of the bundle's first worksheet
    bundle_thumbnail = kwargs.get('bundle_thumbnail')
    if bundle_thumbnail is not None:
        base.add_value('thumbnail', bundle_thumbnail)
    base.add_value('type', Constants.TYPE_MATERIAL)
    base.add_value('lastModified', date_published)
    lom = LomBaseItemloader()
    general = LomGeneralItemloader()
    general.add_value('title', kwargs.get('bundle_title'))
    description_temp = str()
    bundle_desc_temp = kwargs.get('bundle_description')
    worksheet_desc_temp = kwargs.get('worksheet_description_summary')
    # not every bundle has a description, but there's always worksheet descriptions available:
    # NOTE(review): the first branch assumes worksheet_desc_temp is not None whenever a bundle
    # description exists; a None value would raise a TypeError here — verify against the caller
    if bundle_desc_temp is not None:
        description_temp: str = bundle_desc_temp + "\n\n" + worksheet_desc_temp
    elif bundle_desc_temp is None and worksheet_desc_temp is not None:
        description_temp: str = worksheet_desc_temp
    general.add_value('description', description_temp)
    general.add_value('language', 'de')
    general.add_value('identifier', kwargs.get('bundle_url'))
    lom.add_value('general', general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value("format", "text/html")
    technical.add_value('location', kwargs.get('bundle_url'))
    lom.add_value('technical', technical.load_item())
    lifecycle = LomLifecycleItemloader()
    bundle_organization: dict = kwargs.get('bundle_ld_json_organization')
    # the dictionary that we can parse from the website itself looks like this:
    # 'organization': {'@context': 'http://schema.org',
    #                  '@type': 'Organization',
    #                  'name': 'Materialnetzwerk e. G.',
    #                  'sameAs': ['http://twitter.com/materialnw',
    #                             'https://www.facebook.com/materialnetzwerk'],
    #                  'url': 'https://editor.mnweg.org'}}
    # TODO: once its possible to parse a 'organization'-schema-type as a dictionary by the back-end, use
    #  lifecycle.add_value('organization', bundle_organization)
    if bundle_organization is not None:
        lifecycle.add_value('organization', bundle_organization.get("name"))
        lifecycle.add_value('url', bundle_organization.get("url"))
    lifecycle.add_value('date', date_published)
    lom.add_value('lifecycle', lifecycle.load_item())
    educational = LomEducationalItemLoader()
    # TODO: educationalLevel is currently unsupported in the items.py backend?
    educational_level = kwargs.get('bundle_educational_level')
    if educational_level is not None:
        educational.add_value('educationalLevel', educational_level)
    lom.add_value('educational', educational.load_item())
    base.add_value('lom', lom.load_item())
    vs = ValuespaceItemLoader()
    vs.add_value('learningResourceType', 'teaching module')
    bundle_discipline = kwargs.get('bundle_discipline')
    if bundle_discipline is not None:
        # map the site-specific discipline string to our vocabulary (if a mapping exists),
        # otherwise keep the raw value:
        if self.discipline_mapping.get(bundle_discipline) is not None:
            bundle_discipline = self.discipline_mapping.get(bundle_discipline)
        vs.add_value('discipline', bundle_discipline)
    vs.add_value('intendedEndUserRole', 'teacher')
    # logged in users can manipulate the worksheets and fit them to their needs,
    # but there's no login required for just downloading the pdf of an available worksheet
    vs.add_value('conditionsOfAccess', "login required for additional features")
    vs.add_value('price', 'no')
    # we can map "Phase" to our educationalContext with the following ValuespaceHelper method:
    if educational_level is not None:
        vs.add_value(
            "educationalContext",
            ValuespaceHelper.educationalContextByGrade(educational_level))
    lic = LicenseItemLoader()
    # everything is CC-BY-SA 3.0 according to the FAQs: https://mnweg.org/faqs
    lic.add_value('url', Constants.LICENSE_CC_BY_SA_30)
    base.add_value('license', lic.load_item())
    response_loader = ResponseItemLoader()
    response_loader.add_value('url', kwargs.get('bundle_url'))
    base.add_value('valuespaces', vs.load_item())
    base.add_value('response', response_loader.load_item())
    yield base.load_item()
def get_metadata_from_review_url(self, response: scrapy.http.Response, **kwargs):
    """
    Grabs metadata from the "material_review_url"-page and uses the wp_json_item from the "parse_page"-method
    to return a BaseItemLoader with the combined metadata from both sources.

    :param response: the scrapy.http.Response object for the currently parsed page
    :param kwargs: carries the wp_json_item-dictionary under the key "item"
    :return: yields a BaseItemLoader
    """
    wp_json_item = kwargs.get("item")
    # NOTE(review): .get() returns None when the ld+json script-tag is missing, which would make
    # .strip() raise an AttributeError — TODO confirm every reviewed page carries this tag
    ld_json_string = response.xpath('/html/head/script[@type="application/ld+json"]/text()').get().strip()
    ld_json_string = html.unescape(ld_json_string)
    ld_json = json.loads(ld_json_string)
    hash_temp: Optional[str] = None
    language_temp: Optional[str] = None
    pub_date: Optional[str] = None
    organization_id: Optional[str] = None
    organization_name: Optional[str] = None
    date_modified: Optional[str] = None
    # this is a workaround to make sure that we actually grab the following data,
    # no matter where they are positioned in the list:
    # - dateModified
    # - inLanguage
    # - datePublished
    # - organization_name and url
    # e.g.: since there seems to be fluctuation how many elements the "@graph"-Array holds, we can't be sure
    # which position "dateModified" actually has:
    # sometimes it's ld_json.get("@graph")[2], sometimes on [3] etc., therefore we must check all of them
    ld_graph_items = ld_json.get("@graph")
    for item in ld_graph_items:
        if item.get("dateModified") is not None:
            date_modified = item.get("dateModified")  # this can be used instead of 'date' in lastModified
            hash_temp = item.get("dateModified") + self.version
        if item.get("@type") == "WebSite":
            language_temp = item.get("inLanguage")
        if item.get("@type") == "WebPage":
            pub_date = item.get("datePublished")
        if item.get("@type") == "Organization":
            organization_id = item.get("@id")
            organization_name = item.get("name")
    base = BaseItemLoader()
    base.add_value("sourceId", response.url)
    base.add_value("hash", hash_temp)
    # base.add_value("response", super().mapResponse(response).load_item())
    base.add_value("type", Constants.TYPE_MATERIAL)  # TODO: is this correct? use mapping for edu-context?
    base.add_value("thumbnail", wp_json_item.get("material_screenshot"))
    # base.add_value("lastModified", wp_json_item.get("date"))  # is "date" from wp_json for lastModified correct?
    base.add_value("lastModified", date_modified)  # or is this one better (grabbed from material_review_url)?
    lom = LomBaseItemloader()
    general = LomGeneralItemloader(response=response)
    general.add_value("title", wp_json_item.get("material_titel"))
    # the source material heavily fluctuates between perfectly fine strings and messy (hardcoded) html tags
    # as well as "\n" and "\t", therefore we need to clean up that String first:
    raw_description = wp_json_item.get("material_beschreibung")
    raw_description = w3lib.html.remove_tags(raw_description)
    raw_description = w3lib.html.strip_html5_whitespace(raw_description)
    clean_description = w3lib.html.replace_escape_chars(raw_description)
    general.add_value("description", clean_description)
    general.add_value("identifier", wp_json_item.get("id"))
    if language_temp is not None:
        general.add_value("language", language_temp)
    kw_temp = list()
    # keywords arrive as a list of {"name": ...} dictionaries; collect only the names:
    for item in wp_json_item.get("material_schlagworte"):
        kw_temp.append(item.get("name"))
    general.add_value("keyword", kw_temp)
    lom.add_value("general", general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value("format", "text/html")
    technical.add_value("location", wp_json_item.get("material_review_url"))
    lom.add_value("technical", technical.load_item())
    lifecycle = LomLifecycleItemloader()
    if organization_name is not None:
        lifecycle.add_value("organization", organization_name)
    if organization_id is not None:
        lifecycle.add_value("url", organization_id)
    if pub_date is not None:
        lifecycle.add_value("date", pub_date)
    lom.add_value("lifecycle", lifecycle.load_item())
    educational = LomEducationalItemLoader()
    if wp_json_item.get("material_altersstufe") is not None:
        # age range is returned as a list of <from_age>-<to_age>-Strings, possible return values are:
        # e.g. "01-05", "05-10", "10-13", "13-15", "15-19" and "18-99"
        age_regex = re.compile(r'(\d{1,2})-(\d{1,2})')
        age_range = set()
        age_range_item_loader = LomAgeRangeItemLoader()
        for item in wp_json_item.get("material_altersstufe"):
            age_range_temp = item.get("name")
            age_from = str(age_regex.search(age_range_temp).group(1))
            age_to = str(age_regex.search(age_range_temp).group(2))
            age_range.add(age_from)
            age_range.add(age_to)
        # NOTE(review): min()/max() compare the collected ages as strings, not as integers
        # — e.g. "5" would sort above "18"; the observed values are zero-padded two-digit
        # strings which keeps the ordering correct, verify this invariant holds
        if len(age_range) != 0:
            age_range_item_loader.add_value("fromRange", min(age_range))
            age_range_item_loader.add_value("toRange", max(age_range))
            educational.add_value("typicalAgeRange", age_range_item_loader.load_item())
    lom.add_value("educational", educational.load_item())
    base.add_value("lom", lom.load_item())
    vs = ValuespaceItemLoader()
    vs.add_value("discipline", "http://w3id.org/openeduhub/vocabs/discipline/520")  # Religion
    # mapping educationalContext
    educational_context = list()
    for edu_con_item in wp_json_item.get("material_bildungsstufe"):
        educational_context.append(edu_con_item.get("name"))
    for edu_item in educational_context:
        if edu_item in self.mapping_edu_context.keys():
            edu_item = self.mapping_edu_context.get(edu_item)
        if edu_item != "":
            vs.add_value("educationalContext", edu_item)
    # using mapped media_type_list for valuespaces -> learningResourceType
    # see: https://vocabs.openeduhub.de/w3id.org/openeduhub/vocabs/learningResourceType/index.html
    media_type_list = list()
    for item in wp_json_item.get("material_medientyp"):
        media_type_list.append(item.get("name"))
    for media_type_item in media_type_list:
        if media_type_item in self.mapping_media_types.keys():
            media_type_item = self.mapping_media_types.get(media_type_item)
        if media_type_item != "":
            vs.add_value("learningResourceType", media_type_item)
    # there's metadata for "Kompetenzen" (e.g.: "Deuten", "Gestalten", "Reflexion") within the returned wp_json
    # that our data-model doesn't support yet. for future reference though:
    # wp_json_item.get("material_kompetenzen") -> list
    vs.add_value("intendedEndUserRole", "teacher")
    lic = LicenseItemLoader()
    license_regex_nc_reuse = re.compile(r'Zur nicht kommerziellen Wiederverwendung gekennzeichnet')
    license_regex_nc_reuse_and_change = re.compile(
        r'Zur nicht kommerziellen Wiederverwendung und Veränderung gekennzeichnet')
    # important clarification from rpi-virtuell:
    # 'frei zugänglich' describes 'ungeklärte Lizenz' / 'volles Urheberrecht'
    # CC licenses > 'frei zugänglich' if both values are found in the license description
    license_regex_free_access = re.compile(r'frei zugänglich')
    license_regex_free_after_signup = re.compile(r'kostenfrei nach Anmeldung')
    license_regex_with_costs = re.compile(r'kostenpflichtig')
    license_description = response.xpath('//div[@class="material-detail-meta-access material-meta"]'
                                         '/div[@class="material-meta-content-entry"]/text()').get()
    if license_description is not None:
        license_description = html.unescape(license_description.strip())
        lic.add_value("description", license_description)
        # if the RegEx search finds something, it returns a match-object.
        # otherwise by default it returns None
        cc_by_nc_nd = license_regex_nc_reuse.search(license_description)
        cc_by_nc_sa = license_regex_nc_reuse_and_change.search(license_description)
        if cc_by_nc_nd is not None:
            lic.add_value("url", Constants.LICENSE_CC_BY_NC_ND_40)
        if cc_by_nc_sa is not None:
            lic.add_value("url", Constants.LICENSE_CC_BY_NC_SA_30)
        # if a material is "frei zugänglich", set price to none, but don't override a previously set CC-license
        if license_regex_free_access.search(license_description) is not None:
            vs.add_value("price", "no")
            # only if "frei zugänglich" is the only license-description this will trigger:
            # see https://rpi-virtuell.de/nutzungsbedingungen/ (5.)
            if license_regex_free_access.match(license_description) is not None:
                lic.add_value("url", Constants.LICENSE_CC_BY_SA_40)
        if license_regex_with_costs.search(license_description):
            lic.add_value("internal", Constants.LICENSE_COPYRIGHT_LAW)
            vs.add_value("price", "yes")
        if license_regex_free_after_signup.search(license_description):
            vs.add_value("price", "yes")
            vs.add_value("conditionsOfAccess", "login")
    else:
        # by default, all materials should be CC_BY_SA - according to the rpi-virtuell ToS
        lic.replace_value("url", Constants.LICENSE_CC_BY_SA_40)
    authors = list()
    # the author should end up in LOM lifecycle, but the returned metadata are too messily formatted to parse them
    # by easy patterns like (first name) + (last name)
    for item in wp_json_item.get("material_autoren"):
        if item.get("name") is not None:
            if item.get("name").strip() != "":
                authors.append(item.get("name"))
    lic.add_value("author", authors)
    base.add_value("valuespaces", vs.load_item())
    base.add_value("license", lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value("permissions", permissions.load_item())
    response_loader = ResponseItemLoader()
    response_loader.add_value("url", response.url)
    base.add_value("response", response_loader.load_item())
    yield base.load_item()
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Parses an individual topic url for metadata and yields a BaseItem.

    Scrapy Contracts:
    @url https://www.umwelt-im-unterricht.de/hintergrund/generationengerechtigkeit-klimaschutz-und-eine-lebenswerte-zukunft/
    @returns item 1

    :param response: the scrapy.http.Response of the currently crawled topic page
    :param kwargs: unused (kept for Scrapy callback compatibility)
    :return: yields a BaseItem
    """
    current_url: str = response.url
    base = BaseItemLoader()
    base.add_value('sourceId', response.url)
    # NOTE(review): if the info-<span> is missing, date_raw is None and
    # strip_html5_whitespace() would raise — TODO confirm the element exists on every topic page
    date_raw: str = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get()
    date_cleaned_up: str = w3lib.html.strip_html5_whitespace(date_raw)
    hash_temp = str(date_cleaned_up + self.version)
    base.add_value('hash', hash_temp)
    base.add_value('lastModified', date_cleaned_up)
    base.add_value('type', Constants.TYPE_MATERIAL)
    # base.add_value('thumbnail', thumbnail_url)
    lom = LomBaseItemloader()
    general = LomGeneralItemloader()
    general.add_value('identifier', response.url)
    title: str = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get()
    general.add_value('title', title)
    keywords: list = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
    if len(keywords) >= 1:
        # only add keywords if the list isn't empty
        general.add_value('keyword', keywords)
    description: str = response.xpath('/html/head/meta[@name="description"]/@content').get()
    general.add_value('description', description)
    general.add_value('language', 'de')
    lom.add_value('general', general.load_item())
    technical = LomTechnicalItemLoader()
    technical.add_value('format', 'text/html')
    technical.add_value('location', response.url)
    lom.add_value('technical', technical.load_item())
    lifecycle = LomLifecycleItemloader()
    lifecycle.add_value('role', 'publisher')
    lifecycle.add_value('date', date_cleaned_up)
    lifecycle.add_value('url', "https://www.umwelt-im-unterricht.de/impressum/")
    lifecycle.add_value(
        'organization',
        'Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit (BMU)'
    )
    lom.add_value('lifecycle', lifecycle.load_item())
    educational = LomEducationalItemLoader()
    educational.add_value('language', 'de')
    # TODO: a didactic comment could fit into either one of these:
    #  - educational.description
    #  - classification.description (with classification.purpose set to 'educational objective')
    if "/wochenthemen/" in current_url:
        # didactic comments are only part of "Thema der Woche"
        didactic_comment = response.xpath(
            '//div[@class="c-collapse-content js-collapse-content"]').get()
        if didactic_comment is not None:
            didactic_comment = w3lib.html.remove_tags(didactic_comment)
            # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment, which_ones='\t', replace_by=" ")
            # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment)
            didactic_comment = " ".join(didactic_comment.split())
            if didactic_comment.endswith("mehr lesenweniger lesen"):
                # the button-description of the expandable info-box ends up in the string,
                # therefore we are manually removing it:
                didactic_comment = didactic_comment.replace("mehr lesenweniger lesen", "")
            # since there's currently no way to confirm how the string looks in the web-interface:
            # ToDo: make sure which string format looks best in edu-sharing (cleaned up <-> with escape chars)
            educational.add_value('description', didactic_comment)
    lom.add_value('educational', educational.load_item())
    classification = LomClassificationItemLoader()
    if "/unterrichtsvorschlaege/" in current_url:
        classification.add_value('purpose', 'competency')
        competency_description: list = response.xpath(
            '//div[@class="b-cpsuiu-show-description"]/*[not('
            '@class="cc-licence-info")]').getall()
        # the xpath-expression for competency_description will grab the whole div-element,
        # but EXCLUDE the "license"-container (if the license-description exists, it's always part of the same div)
        if len(competency_description) >= 1:
            # only if the list of strings is not empty, we'll try to type-convert it to a string (and clean its
            # formatting up)
            competency_description: str = " ".join(competency_description)
            competency_description = w3lib.html.remove_tags(competency_description)
            classification.add_value('description', competency_description)
    lom.add_value('classification', classification.load_item())
    base.add_value('lom', lom.load_item())
    vs = ValuespaceItemLoader()
    # depending on the website-category, we need to set a specific learningResourceType
    # because the value 'website' for all crawled items would not be helpful enough
    if "/wochenthemen/" in current_url or "/unterrichtsvorschlaege/" in current_url:
        vs.add_value('learningResourceType', 'lesson plan')
    if "/hintergrund/" in current_url:
        vs.add_value('learningResourceType', 'Text')
    if "/medien/dateien/" in current_url:
        # topics categorized as "Arbeitsmaterial" offer customizable worksheets to teachers
        vs.add_value('learningResourceType', 'worksheet')
    if "/medien/videos/" in current_url:
        vs.add_value('learningResourceType', 'video')
    if "/medien/bilder/" in current_url:
        # topics categorized as "Bilderserie" hold several images in a gallery (with individual licenses)
        vs.add_value('learningResourceType', 'image')
    vs.add_value('price', 'no')
    vs.add_value('containsAdvertisement', 'no')
    vs.add_value('conditionsOfAccess', 'no login')
    vs.add_value('intendedEndUserRole', 'teacher')
    # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/
    vs.add_value('accessibilitySummary', 'Not tested')
    # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/
    vs.add_value('dataProtectionConformity', 'Sensible data collection')
    # see: https://www.umwelt-im-unterricht.de/datenschutz/
    disciplines_raw: list = response.xpath(
        '//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
    if len(disciplines_raw) >= 1:
        disciplines = list()
        for discipline_value in disciplines_raw:
            # self.debug_discipline_values.add(discipline_value)
            if discipline_value in self.DISCIPLINE_MAPPING.keys():
                discipline_value = self.DISCIPLINE_MAPPING.get(discipline_value)
            # since the mapping value can either be a single string OR a list of strings, we need to make sure that
            # our 'disciplines'-list is a list of strings (not a list with nested lists):
            if type(discipline_value) is list:
                disciplines.extend(discipline_value)
            else:
                disciplines.append(discipline_value)
        if len(disciplines) >= 1:
            vs.add_value('discipline', disciplines)
    educational_context_raw = response.xpath(
        '//div[@class="b-cpsuiu-show-targets"]/ul/li/a/text()').getall()
    if len(educational_context_raw) >= 1:
        # the educationalContext-mapping is only done when there's at least one educational_context found
        educational_context = list()
        for educational_context_value in educational_context_raw:
            # self.debug_educational_context_values.add(educational_context_value)
            if educational_context_value in self.EDUCATIONAL_CONTEXT_MAPPING.keys():
                educational_context_value = self.EDUCATIONAL_CONTEXT_MAPPING.get(
                    educational_context_value)
            if type(educational_context_value) is list:
                educational_context.extend(educational_context_value)
            else:
                educational_context.append(educational_context_value)
        if len(educational_context) >= 1:
            vs.add_value('educationalContext', educational_context)
    base.add_value('valuespaces', vs.load_item())
    lic = LicenseItemLoader()
    license_url: str = response.xpath(
        '//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get()
    if license_url is not None:
        if license_url.startswith("http://"):
            # the license-mapper expects urls that are in https:// format, but UIU uses http:// links to CC-licenses
            license_url = license_url.replace("http://", "https://")
        lic.add_value('url', license_url)
    license_description_raw: str = response.xpath('//div[@class="cc-licence-info"]').get()
    if license_description_raw is not None:
        license_description_raw = w3lib.html.remove_tags(license_description_raw)
        license_description_raw = w3lib.html.replace_escape_chars(
            license_description_raw, which_ones="\n", replace_by=" ")
        # if we would replace_escape_chars() straight away, there would be words stuck together that don't belong
        # together. just replacing \n with a whitespace is enough to keep the structure of the string intact.
        license_description_raw = w3lib.html.replace_escape_chars(license_description_raw)
        # making sure that there's only 1 whitespace between words:
        license_description = " ".join(license_description_raw.split())
        lic.add_value('description', license_description)
    base.add_value('license', lic.load_item())
    permissions = super().getPermissions(response)
    base.add_value('permissions', permissions.load_item())
    response_loader = super().mapResponse(response)
    base.add_value('response', response_loader.load_item())
    yield base.load_item()