예제 #1
0
    def parse(self, response: scrapy.http.Response):
        """Split one XML result page into per-item responses and paginate.

        Every child of ``/root/items`` becomes a copy of ``response`` whose
        body is that element's XML and whose ``meta["item"]`` is the parsed
        ``data`` dictionary. Non-public items are skipped. When the page is
        full (``self.limit`` elements) a request for the next page is yielded.
        """
        print("Parsing URL: " + response.url)

        # Call Splash only once per page (a page holds multiple XML elements).
        response.meta["rendered_data"] = self.getUrlData(response.url)

        # The raw bytes are parsed because the payload includes the XML declaration:
        # <?xml version="1.0" encoding="utf-8"?>
        # (.fromstring(response.text) would be used otherwise).
        document = etree.ElementTree(etree.XML(response.body))
        items = document.xpath("/root/items/*")

        # Iterating an empty result list is simply a no-op.
        for node in items:
            item_response = response.copy()
            node_xml = etree.tostring(node,
                                      pretty_print=True,
                                      encoding="unicode")
            record = xmltodict.parse(node_xml)["data"]

            # Temporary solution for public-only content.
            # TODO: remove this when licensed content are enabled!
            if not self.is_public(record):
                continue

            # TODO: It's probably a pointless attribute.
            # del element_dict["data"]["score"]

            # The dictionary is passed along for easier attribute access.
            item_response.meta["item"] = record

            # In case JSON string representation is preferred:
            # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
            item_response._set_body(node_xml)

            if self.hasChanged(item_response):
                yield self.handleEntry(item_response)

            # LomBase.parse() has to be called for every individual instance
            # that needs to be saved to the database.
            LomBase.parse(self, item_response)

        # TODO: To not stress the Rest APIs.
        # time.sleep(0.1)

        # A full page means more results are waiting behind it.
        if len(items) == self.limit:
            self.page += 1
            offset = str(self.page * self.limit)
            page_size = str(self.limit)
            next_url = self.apiUrl.replace("%start", offset)
            next_url = next_url.replace("%anzahl", page_size)
            xml_headers = {
                "Accept": "application/xml",
                "Content-Type": "application/xml",
            }
            yield scrapy.Request(url=next_url,
                                 callback=self.parse,
                                 headers=xml_headers)
    def parse(self, response: scrapy.http.Response):
        """Turn a JSON result page into one response per grouped element.

        The page body is decoded as JSON, the elements are grouped with
        ``self.group_elements_by_sammlung`` and for every group a copy of
        ``response`` is produced whose body is the group's JSON dump and whose
        ``meta["item"]`` is the group itself. Changed entries are yielded via
        ``self.handleEntry``.
        """
        # Call Splash only once per page (a page holds multiple elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data

        # ``response.text`` is the supported replacement for the deprecated
        # ``TextResponse.body_as_unicode()``; both return the decoded body.
        elements = json.loads(response.text)

        # grouped_elements = self.group_elements_by_medium_id(elements)
        grouped_elements = self.group_elements_by_sammlung(elements)

        for element in grouped_elements:
            copyResponse = response.copy()

            # Passing the dictionary for easier access to attributes.
            copyResponse.meta["item"] = element

            # The copy's body is replaced by the JSON form of this group.
            json_str = json.dumps(element,
                                  indent=4,
                                  sort_keys=True,
                                  ensure_ascii=False)
            copyResponse._set_body(json_str)
            print(json_str)

            if self.hasChanged(copyResponse):
                yield self.handleEntry(copyResponse)

            # LomBase.parse() has to be called for every individual instance
            # that needs to be saved to the database.
            LomBase.parse(self, copyResponse)
예제 #3
0
    def parse(self, response: scrapy.http.Response):
        """Turn a JSON result page into one response per collection element.

        Each decoded element is passed through ``self.prepare_element``; the
        prepared list is then handed to ``self.prepare_collections``. For every
        resulting element a copy of ``response`` is produced whose body is the
        element's JSON dump and whose ``meta["item"]`` is the element itself.
        Changed entries are yielded via ``self.handleEntry``.
        """
        # ``response.text`` is the supported replacement for the deprecated
        # ``TextResponse.body_as_unicode()``; both return the decoded body.
        elements = json.loads(response.text)
        prepared_elements = [
            self.prepare_element(element_dict) for element_dict in elements
        ]

        collection_elements = self.prepare_collections(prepared_elements)

        for element_dict in collection_elements:
            copyResponse = response.copy()

            # Passing the dictionary for easier access to attributes.
            copyResponse.meta["item"] = element_dict

            # The copy's body is replaced by the JSON form of this element.
            json_str = json.dumps(element_dict,
                                  indent=4,
                                  sort_keys=True,
                                  ensure_ascii=False)
            copyResponse._set_body(json_str)

            if self.hasChanged(copyResponse):
                yield self.handleEntry(copyResponse)

            # LomBase.parse() has to be called for every individual instance
            # that needs to be saved to the database.
            LomBase.parse(self, copyResponse)
예제 #4
0
 def getBase(self, response):
     """Extend the LomBase base loader with the item's ``lastModified`` value.

     The value is the first text node found under ``letzte_aenderung`` of
     the selector stored in ``response.meta["item"]``.
     """
     loader = LomBase.getBase(self, response)
     modified = response.meta["item"].xpath("letzte_aenderung//text()").get()
     loader.add_value("lastModified", modified)
     return loader
예제 #5
0
    def getLOMGeneral(self, response):
        """Fill title and description of the general loader from the XML body."""
        loader = LomBase.getLOMGeneral(self, response)

        # (loader field, XPath into the per-item XML document)
        field_queries = (
            ("title", "/data/titel/text()"),
            ("description", "/data/beschreibung/text()"),
        )
        for field, query in field_queries:
            loader.add_value(field, response.xpath(query).get())

        return loader
    def getValuespaces(self, response):
        """Map the item's ``acf.*`` choice fields onto valuespace entries.

        Every field is read with ``self.get(..., json=response.meta["item"])``
        and reduced to the ``"value"`` of each entry. ``sourceContentType`` and
        ``toolCategory`` are only added when the field yields entries; the
        other three are always added (possibly as an empty list).
        """
        valuespaces = LomBase.getValuespaces(self, response)
        item = response.meta["item"]

        def selected_values(path):
            # The lookup may come back falsy (the optional fields were already
            # guarded for that); treat it as "no entries" for every field
            # instead of failing while iterating a non-iterable.
            entries = self.get(path, json=item)
            return [entry["value"] for entry in entries or []]

        valuespaces.add_value("discipline", selected_values("acf.fachgebiet"))

        lernresourcentyp = selected_values("acf.lernresourcentyp")
        if lernresourcentyp:
            valuespaces.add_value("sourceContentType", lernresourcentyp)

        category = selected_values("acf.category")
        if category:
            valuespaces.add_value("toolCategory", category)

        valuespaces.add_value("educationalContext",
                              selected_values("acf.schulform"))
        valuespaces.add_value("intendedEndUserRole",
                              selected_values("acf.role"))
        return valuespaces
 def getLOMTechnical(self, response):
     """Set format and location on the technical loader.

     The format is fixed to ``text/html``; the location is whatever
     ``self.get("acf.url", ...)`` returns for the current item.
     """
     loader = LomBase.getLOMTechnical(self, response)
     loader.replace_value("format", "text/html")
     target = self.get("acf.url", json=response.meta["item"])
     loader.replace_value("location", target)
     return loader
 def parseEntry(self, response):
     """Parse the entry only when its language is ``"de"``.

     Returns the result of ``LomBase.parse`` for German entries and ``None``
     for everything else (the skipped language is logged at INFO level).
     """
     language = self.get("language", response=response)
     if language == "de":
         return LomBase.parse(self, response)
     # Lazy %-formatting instead of string concatenation: the language may be
     # missing (None), which would make ``str + None`` raise a TypeError.
     logging.info("Skipping entry with language %s", language)
     return None
예제 #9
0
    def getBase(self, response):
        """Add thumbnail, default thumbnail and searchable flag to the base loader.

        The default thumbnail is the first usable path among ``srcLogoUrl``,
        ``logo`` and a hard-coded Merlin logo, prefixed with the Merlin host.
        """
        base = LomBase.getBase(self, response)

        # Element response as a Python dict.
        element_dict = dict(response.meta["item"])

        base.add_value("thumbnail", element_dict.get("thumbnail",
                                                     ""))  # get or default

        # By order of preference. A key that is present but holds None or an
        # empty string is not a usable path (concatenating None would raise a
        # TypeError), so it falls through to the next candidate and finally to
        # the hard-coded logo.
        logo_path = (element_dict.get("srcLogoUrl")
                     or element_dict.get("logo")
                     or "/logos/bs_logos/merlin.png")
        base.add_value("defaultThumbnail",
                       "https://merlin.nibis.de" + logo_path)

        # Adding a default searchable value to constitute this element (node)
        # as a valid-to-be-returned object.
        base.add_value("searchable", "1")

        return base
예제 #10
0
 def getLOMEducational(self, response):
     """Add the HTML-unescaped ``acf.short_text`` as educational description."""
     # Local import: ``HTMLParser().unescape`` was removed in Python 3.9 and
     # ``html.unescape`` is its documented replacement.
     import html

     educational = LomBase.getLOMEducational(self, response)
     short_text = self.get('acf.short_text', json=response.meta['item'])
     # Only strings can be unescaped; a missing value is left out instead of
     # raising a TypeError.
     if isinstance(short_text, str):
         educational.add_value('description', html.unescape(short_text))
     return educational
예제 #11
0
 def getLOMGeneral(self, response):
   """Copy identifier, title, keywords and language into the general loader."""
   loader = LomBase.getLOMGeneral(self, response)
   # (loader field, key passed to self.get)
   field_keys = (
     ('identifier', 'id'),
     ('title', 'title'),
     ('keyword', 'keywords'),
     ('language', 'language'),
   )
   for field, key in field_keys:
     loader.add_value(field, self.get(key, response = response))
   return loader
예제 #12
0
 def getValuespaces(self, response):
     """Add the LRMI ``learningResourceType`` to the valuespaces loader."""
     loader = LomBase.getValuespaces(self, response)
     resource_type = self.getLRMI("learningResourceType", response=response)
     loader.add_value("learningResourceType", resource_type)
     return loader
예제 #13
0
 def getValuespaces(self, response):
     """Add the LRMI ``audience.educationalRole`` as ``intendedEndUserRole``."""
     loader = LomBase.getValuespaces(self, response)
     audience_role = self.getLRMI("audience.educationalRole",
                                  response=response)
     loader.add_value("intendedEndUserRole", audience_role)
     return loader
    def getBase(self, response):
        """Set thumbnail, origin and fulltext on the base loader.

        When the node has a ``ccm:replicationsource`` property the fulltext is
        downloaded from the item's ``downloadUrl`` (best effort: request
        failures are logged and ignored). Otherwise the repository's
        ``textContent`` endpoint is queried for the node.
        """
        base = LomBase.getBase(self, response)
        item = response.meta["item"]
        base.replace_value("thumbnail", item["preview"]["url"])

        replication_source = self.getProperty("ccm:replicationsource",
                                              response)
        base.replace_value("origin", replication_source)

        if replication_source:
            # imported objects usually have the content as binary text
            # TODO: Sometimes, edu-sharing redirects if no local content is found, and this should be html-parsed
            try:
                r = requests.get(item["downloadUrl"])
                if r.status_code == 200:
                    base.replace_value("fulltext", r.text)
            except requests.RequestException as error:
                # Placeholders are required here: an extra positional argument
                # without a matching "%s" makes logging report a formatting
                # error instead of emitting the warning.
                logging.warning("error fetching data from %s: %s",
                                item["downloadUrl"], error)
        else:
            # try to transform using alfresco
            r = requests.get(
                self.apiUrl + "/node/v1/nodes/" + item["ref"]["repo"] + "/" +
                item["ref"]["id"] + "/textContent",
                headers={
                    "Accept": "application/json"
                },
            ).json()
            if "text" in r:
                base.replace_value("fulltext", r["text"])

        return base
예제 #15
0
 def getLOMTechnical(self, response):
     """Add the fixed ``text/html`` format and the record URL as location.

     The location is the first text node under ``url_datensatz`` of the
     selector stored in ``response.meta["item"]``.
     """
     loader = LomBase.getLOMTechnical(self, response)
     loader.add_value("format", "text/html")
     record_url = response.meta["item"].xpath("url_datensatz//text()").get()
     loader.add_value("location", record_url)
     return loader
예제 #16
0
 def getValuespaces(self, response):
   """Add every discipline whose label occurs in the item's ``systematikpfad``.

   The comparison is a case-insensitive substring test of each label's
   ``@value`` against the first text node under ``systematikpfad``.
   """
   valuespaces = LomBase.getValuespaces(self, response)
   text = response.meta['item'].xpath('systematikpfad//text()').get()
   # ``.get()`` yields None when the element has no text; nothing can match then.
   if text is None:
     return valuespaces
   # Fold once instead of once per label.
   haystack = text.casefold()
   for entry in ProcessValuespacePipeline.valuespaces['discipline']:
     if any(label['@value'].casefold() in haystack for label in entry['label']):
       valuespaces.add_value('discipline', entry['id'])
   return valuespaces
예제 #17
0
 def getLOMGeneral(self, response):
     """Fill title, keywords and description of the general loader.

     The title is the stripped ``Name`` of the item. Keywords are the
     non-empty stripped text nodes of ``topic-name`` elements; the
     description joins the non-empty stripped paragraph texts of inline-text
     content modules with newlines.
     """
     general = LomBase.getLOMGeneral(self, response)
     general.add_value("title", response.meta["item"].get("Name").strip())

     topic_texts = response.xpath(
         '//*[@id="ContentModuleApp"]//*[@class="topic-name"]//text()'
     ).getall()
     stripped_topics = (raw.strip() for raw in topic_texts)
     keywords = [topic for topic in stripped_topics if topic]
     general.add_value("keyword", keywords)

     paragraph_texts = response.xpath(
         '//*[@id="ContentModuleApp"]//*[@content-module-type="inlinetext"]//p//text()'
     ).getall()
     stripped_paragraphs = (raw.strip() for raw in paragraph_texts)
     paragraphs = [part for part in stripped_paragraphs if part]
     description = "\n".join(paragraphs).strip()
     general.add_value("description", description)
     return general
예제 #18
0
    def getValuespaces(self, response):
        """Derive discipline and educational context for the entry.

        The discipline is the part before the first ``/`` of the first
        category (``Mathe`` is mapped to ``Mathematik``). Educational contexts
        are added for every grade-level pattern that matches at least one
        keyword returned by ``self.getKeywords``.
        """
        valuespaces = LomBase.getValuespaces(self, response=response)
        text = self.get("categories", response=response)[0].split("/")[0]
        # manual mapping to Mathematik
        if text == "Mathe":
            text = "Mathematik"
        valuespaces.add_value("discipline", text)
        # for entry in ProcessValuespacePipeline.valuespaces['discipline']:
        #  if len(list(filter(lambda x:x['@value'].casefold() == text.casefold(), entry['label']))):
        #    valuespaces.add_value('discipline',entry['id'])

        # Fetched once and materialised so it can be scanned per pattern.
        keywords = list(self.getKeywords(response))

        # Raw strings: "\s" in a normal string literal is an invalid escape
        # sequence (a SyntaxWarning on recent Python versions).
        grade_levels = (
            (r"Klasse\s[1-4]$", "Grundschule"),
            (r"Klasse\s([5-9]|10)$", "Sekundarstufe 1"),
            (r"Klasse\s1[1-2]", "Sekundarstufe 2"),
        )
        for pattern, context in grade_levels:
            matcher = re.compile(pattern, re.IGNORECASE)
            if any(matcher.match(keyword) for keyword in keywords):
                valuespaces.add_value("educationalContext", context)
        return valuespaces
예제 #19
0
 def getValuespaces(self, response):
     """Add every discipline whose German label occurs in ``systematikpfad``.

     The comparison is a case-insensitive substring test of each entry's
     ``prefLabel["de"]`` against the first text node under
     ``systematikpfad``.
     """
     valuespaces = LomBase.getValuespaces(self, response)
     text = response.meta["item"].xpath("systematikpfad//text()").get()
     # ``.get()`` yields None when the element has no text; nothing can match.
     if text is None:
         return valuespaces
     # Fold once instead of once per mapping entry.
     haystack = text.casefold()
     for entry in self.valuespacesMapping.data["discipline"]:
         if entry["prefLabel"]["de"].casefold() in haystack:
             valuespaces.add_value("discipline", entry["id"])
     return valuespaces
예제 #20
0
 def getLOMGeneral(self, response):
     """Copy title, keywords and description into the general loader."""
     loader = LomBase.getLOMGeneral(self, response=response)

     title = self.get("title", response=response)
     loader.add_value("title", title)

     keywords = self.getKeywords(response)
     loader.add_value("keyword", keywords)

     description = self.get("description", response=response)
     loader.add_value("description", description)
     return loader
예제 #21
0
 def mapResponse(self, response):
     """Cut the mapped text off at the licence footer sentence.

     Everything from the first occurrence of the footer sentence onwards is
     dropped; text without the sentence is kept unchanged.
     """
     license_footer = (
         "Dieses Werk steht unter der freien Lizenz CC BY-SA 4.0 Information"
     )
     mapped = LomBase.mapResponse(self, response)
     full_text = mapped.load_item()["text"]
     content_only = full_text.split(license_footer)[0]
     mapped.replace_value("text", content_only)
     return mapped
예제 #22
0
 def getLOMGeneral(self, response):
     """Fill title and language of the general loader from the HTML page."""
     loader = LomBase.getLOMGeneral(self, response)
     # (loader field, XPath into the page)
     field_queries = (
         ('title', '//title//text()'),
         ('language', '//meta[@property="og:locale"]/@content'),
     )
     for field, query in field_queries:
         loader.add_value(field, response.xpath(query).get())
     return loader
예제 #23
0
    def getLOMEducational(self, response):
        """Fill the educational loader from the OAI-PMH LOM record.

        Adds the general description, the typical age range (when given as
        ``from-to``) and the educational language. Namespaces are removed from
        the response selector so the plain element names can be queried.
        """
        response.selector.remove_namespaces()
        record = response.xpath('//OAI-PMH/GetRecord/record')

        # ``self`` must be passed explicitly when calling through the class;
        # without it the response would be bound as ``self`` and the loader
        # would be created without a response.
        educational = LomBase.getLOMEducational(self, response)
        #TODO put in general description
        educational.add_value(
            'description',
            record.xpath('metadata/lom/general/description/string//text()').
            extract_first())
        tarString = record.xpath(
            'metadata/lom/educational/typicalAgeRange/string//text()'
        ).extract_first()
        if tarString:
            tar = LomAgeRangeItemLoader()
            tarSplitted = tarString.split('-')
            # Only a "from-to" value can be mapped onto the range loader.
            if len(tarSplitted) > 1:
                tar.add_value('fromRange', tarSplitted[0])
                tar.add_value('toRange', tarSplitted[1])
                educational.add_value('typicalAgeRange', tar.load_item())
            else:
                self.logger.info('unknown agerange %s', tarString)
        educational.add_value(
            'language',
            record.xpath(
                'metadata/lom/educational/language//text()').extract_first())
        return educational
예제 #24
0
 def getBase(self, response):
     """Add the page's ``og:image`` as thumbnail, if the page declares one.

     The image path has ``_350`` replaced by ``_1000`` and is prefixed with
     ``self.url``.
     """
     loader = LomBase.getBase(self, response)
     og_image = response.xpath('//meta[@property="og:image"]//@content').get()
     if not og_image:
         return loader
     thumbnail_url = self.url + og_image.replace("_350", "_1000")
     loader.add_value("thumbnail", thumbnail_url)
     # base.add_value('thumbnail', self.url + '/Images/Categories/' + str(self.getId(response)) + '_1000.jpg')
     return loader
예제 #25
0
    def getPermissions(self, response):
        """Mark the entry as non-public with an auto-created ``public`` group."""
        loader = LomBase.getPermissions(self, response)

        loader.replace_value("public", False)
        extra_values = (
            ("autoCreateGroups", True),
            ("groups", ["public"]),
        )
        for field, value in extra_values:
            loader.add_value(field, value)

        return loader
예제 #26
0
 def getBase(self, response: Response) -> items.BaseItemLoader:
     """Add origin, modification date, thumbnail and fulltext to the base loader.

     Origin is the stripped ``sourceTitle`` of ``meta["row"]``; the
     modification date is the item snippet's ``publishedAt``.
     """
     loader = LomBase.getBase(self, response)

     origin = response.meta["row"]["sourceTitle"].strip()
     loader.add_value("origin", origin)

     published_at = response.meta["item"]["snippet"]["publishedAt"]
     loader.add_value("lastModified", published_at)

     thumbnail_url = self.getThumbnailUrl(response)
     loader.add_value("thumbnail", thumbnail_url)

     fulltext = self.getFulltext(response)
     loader.add_value("fulltext", fulltext)
     return loader
    def getLOMTechnical(self, response):
        """Add format, location and body size to the technical loader."""
        loader = LomBase.getLOMTechnical(self, response)

        loader.add_value("format", "text/html")
        location = self.getUri(response)
        loader.add_value("location", location)
        # Size is the length of the response body.
        body_size = len(response.body)
        loader.add_value("size", body_size)

        return loader
예제 #28
0
 def getLOMEducational(self, response):
   """Add the HTML-unescaped ``beschreibung`` text as educational description.

   Descriptions starting with ``swiffyobject_`` are treated as invalid and
   left out.
   """
   # Local import: ``HTMLParser().unescape`` was removed in Python 3.9 and
   # ``html.unescape`` is its documented replacement.
   import html

   educational = LomBase.getLOMEducational(self, response)
   desc = response.meta['item'].xpath('beschreibung//text()').get()
   # ``.get()`` yields None when the element has no text; nothing to add then.
   if desc is None:
     return educational
   desc = desc.strip()
   # dirty cleaning of invalid descriptions
   # not perfect yet, these objects also appear inside the content
   if not desc.startswith('swiffyobject_'):
     educational.add_value('description', html.unescape(desc))
   return educational
예제 #29
0
 def getLOMLifecycle(self,
                     response: Response) -> items.LomLifecycleItemloader:
     """Record the channel as author organisation on the lifecycle loader.

     The organisation is the item snippet's ``channelTitle``; the URL comes
     from ``self.getChannelUrl``.
     """
     loader = LomBase.getLOMLifecycle(self, response)
     loader.add_value("role", "author")

     channel_title = response.meta["item"]["snippet"]["channelTitle"]
     loader.add_value("organization", channel_title)

     channel_url = self.getChannelUrl(response)
     loader.add_value("url", channel_url)
     return loader
예제 #30
0
 def getValuespaces(self, response):
     """Attach the fixed educational contexts and disciplines of this source."""
     loader = LomBase.getValuespaces(self, response)

     contexts = (
         "sekundarstufe_2",
         "berufliche_bildung",
         "erwachsenenbildung",
     )
     for context in contexts:
         loader.add_value("educationalContext", context)

     disciplines = (
         "700",  # Wirtschaftskunde
         "48005",  # Gesellschaftskunde
     )
     for discipline in disciplines:
         loader.add_value("discipline", discipline)
     return loader