def parse(self, response: scrapy.http.Response):
    """Split one XML result page into per-element responses and paginate.

    Each child of ``/root/items`` is serialized back to XML, converted to a
    dict with ``xmltodict`` and attached to a copy of ``response`` (both as
    ``meta["item"]`` and as the body of the copy). Elements for which
    ``self.is_public`` is false are skipped.

    Yields:
        ``self.handleEntry(...)`` for every element that ``self.hasChanged``
        reports as changed, and finally a ``scrapy.Request`` for the next
        page when the current page contained exactly ``self.limit`` elements.
    """
    print("Parsing URL: " + response.url)

    # Call Splash only once per page (that contains multiple XML elements).
    response.meta["rendered_data"] = self.getUrlData(response.url)

    # The raw bytes are parsed because the payload includes the XML
    # declaration (<?xml version="1.0" encoding="utf-8"?>).
    document = etree.ElementTree(etree.XML(response.body))
    nodes = document.xpath("/root/items/*")

    # Iterating an empty result list is a no-op, so no explicit emptiness check.
    for node in nodes:
        node_xml = etree.tostring(node, pretty_print=True, encoding="unicode")
        record = xmltodict.parse(node_xml)["data"]

        # Temporary solution for public-only content.
        # TODO: remove this when licensed content is enabled!
        if not self.is_public(record):
            continue

        entry_response = response.copy()
        # Passing the dictionary for easier access to attributes.
        entry_response.meta["item"] = record
        entry_response._set_body(node_xml)

        if self.hasChanged(entry_response):
            yield self.handleEntry(entry_response)

        # LomBase.parse() has to be called for every individual instance that
        # needs to be saved to the database.
        LomBase.parse(self, entry_response)

    # A full page (as many results as the imposed limit) means there are more
    # results to fetch.
    if len(nodes) == self.limit:
        self.page += 1
        offset = str(self.page * self.limit)
        page_size = str(self.limit)
        next_url = self.apiUrl.replace("%start", offset).replace("%anzahl", page_size)
        yield scrapy.Request(
            url=next_url,
            callback=self.parse,
            headers={
                "Accept": "application/xml",
                "Content-Type": "application/xml",
            },
        )
def parse(self, response: scrapy.http.Response):
    """Parse a JSON result page and emit one entry per grouped element.

    The response body is decoded as JSON, grouped through
    ``self.group_elements_by_sammlung`` and each group is attached to a copy
    of ``response`` (as ``meta["item"]`` and, serialized, as its body).

    Yields:
        ``self.handleEntry(...)`` for every group that ``self.hasChanged``
        reports as changed.
    """
    # Call Splash only once per page (that contains multiple XML elements).
    response.meta["rendered_data"] = self.getUrlData(response.url)

    # ``response.text`` is the supported replacement for the deprecated
    # ``body_as_unicode()``.
    elements = json.loads(response.text)
    grouped_elements = self.group_elements_by_sammlung(elements)

    for element in grouped_elements:
        copy_response = response.copy()

        # Passing the dictionary for easier access to attributes.
        copy_response.meta["item"] = element

        # The JSON string representation becomes the body of the copy.
        json_str = json.dumps(element, indent=4, sort_keys=True, ensure_ascii=False)
        copy_response._set_body(json_str)
        print(json_str)

        if self.hasChanged(copy_response):
            yield self.handleEntry(copy_response)

        # LomBase.parse() has to be called for every individual instance that
        # needs to be saved to the database.
        LomBase.parse(self, copy_response)
def parse(self, response: scrapy.http.Response):
    """Parse a JSON result page and emit one entry per prepared collection element.

    Every decoded element is passed through ``self.prepare_element``; the
    prepared list is then handed to ``self.prepare_collections``. Each
    resulting dict is attached to a copy of ``response`` (as ``meta["item"]``
    and, serialized, as its body).

    Yields:
        ``self.handleEntry(...)`` for every element that ``self.hasChanged``
        reports as changed.
    """
    # ``response.text`` is the supported replacement for the deprecated
    # ``body_as_unicode()``.
    elements = json.loads(response.text)
    prepared_elements = [self.prepare_element(element_dict) for element_dict in elements]
    collection_elements = self.prepare_collections(prepared_elements)

    for element_dict in collection_elements:
        copy_response = response.copy()

        # Passing the dictionary for easier access to attributes.
        copy_response.meta["item"] = element_dict

        # The JSON string representation becomes the body of the copy.
        json_str = json.dumps(element_dict, indent=4, sort_keys=True, ensure_ascii=False)
        copy_response._set_body(json_str)

        if self.hasChanged(copy_response):
            yield self.handleEntry(copy_response)

        # LomBase.parse() has to be called for every individual instance that
        # needs to be saved to the database.
        LomBase.parse(self, copy_response)
def getBase(self, response):
    """Return the base loader extended with the item's last-modified text.

    The value is the first text node found under ``letzte_aenderung`` in
    ``response.meta["item"]``.
    """
    loader = LomBase.getBase(self, response)
    last_change = response.meta["item"].xpath("letzte_aenderung//text()").get()
    loader.add_value("lastModified", last_change)
    return loader
def getLOMGeneral(self, response):
    """Return the LOM general loader filled with title and description.

    Both values are read from the response body via XPath
    (``/data/titel`` and ``/data/beschreibung``).
    """
    loader = LomBase.getLOMGeneral(self, response)
    field_queries = (
        ("title", "/data/titel/text()"),
        ("description", "/data/beschreibung/text()"),
    )
    for field_name, query in field_queries:
        loader.add_value(field_name, response.xpath(query).get())
    return loader
def getValuespaces(self, response):
    """Return the valuespaces loader filled from the item's ``acf.*`` fields.

    Each source field is expected to hold a list of dicts with a ``"value"``
    key; the collected values are added to the mapped valuespace field.
    Source fields that are missing or empty are skipped for every mapping
    (previously only two of the five fields were guarded, so a missing
    ``acf.fachgebiet``, ``acf.schulform`` or ``acf.role`` raised a
    ``TypeError``).
    """
    valuespaces = LomBase.getValuespaces(self, response)
    item = response.meta["item"]

    # (source path inside the item, target valuespace field), in the order the
    # values are added to the loader.
    field_mapping = (
        ("acf.fachgebiet", "discipline"),
        ("acf.lernresourcentyp", "sourceContentType"),
        ("acf.category", "toolCategory"),
        ("acf.schulform", "educationalContext"),
        ("acf.role", "intendedEndUserRole"),
    )
    for source_path, target_field in field_mapping:
        entries = self.get(source_path, json=item)
        if entries:
            valuespaces.add_value(target_field, [entry["value"] for entry in entries])
    return valuespaces
def getLOMTechnical(self, response):
    """Return the LOM technical loader with format and location replaced.

    The format is fixed to ``text/html``; the location is the item's
    ``acf.url`` value.
    """
    loader = LomBase.getLOMTechnical(self, response)
    loader.replace_value("format", "text/html")
    target_url = self.get("acf.url", json=response.meta["item"])
    loader.replace_value("location", target_url)
    return loader
def parseEntry(self, response):
    """Parse the entry only when its language is German.

    Returns:
        The result of ``LomBase.parse`` for entries whose ``language`` is
        ``"de"``; ``None`` for every other entry (which is logged and skipped).
    """
    language = self.get("language", response=response)
    if language == "de":
        return LomBase.parse(self, response)
    # Lazy %-formatting also copes with a missing (None) language, which the
    # former string concatenation turned into a TypeError.
    logging.info("Skipping entry with language %s", language)
    return None
def getBase(self, response):
    """Return the base loader with thumbnail, default thumbnail and searchable flag.

    The default thumbnail is built from the first available key among
    ``srcLogoUrl``, ``logo`` and a hard-coded fallback logo path, prefixed
    with the Merlin host.
    """
    loader = LomBase.getBase(self, response)

    # Work on a plain-dict copy of the element.
    item = dict(response.meta["item"])
    loader.add_value("thumbnail", item.get("thumbnail", ""))

    # Backup logo, used when no other thumbnail URL is available; because it
    # is always present, the lookup below always finds a value.
    item["hardcodedDefaultLogoUrl"] = "/logos/bs_logos/merlin.png"

    # Keys in order of preference; the first one present wins.
    preferred_keys = ("srcLogoUrl", "logo", "hardcodedDefaultLogoUrl")
    logo_path = next(item[key] for key in preferred_keys if key in item)
    loader.add_value("defaultThumbnail", "https://merlin.nibis.de" + logo_path)

    # Default searchable value so that this element (node) counts as a valid
    # object to be returned.
    loader.add_value("searchable", "1")
    return loader
def getLOMEducational(self, response):
    """Return the LOM educational loader with the item's short text as description.

    HTML entities in ``acf.short_text`` are unescaped. A missing short text
    is skipped instead of raising.
    """
    # ``HTMLParser.unescape`` was removed in Python 3.9; ``html.unescape`` is
    # its replacement. Imported locally so the block does not depend on the
    # file's import section.
    from html import unescape

    educational = LomBase.getLOMEducational(self, response)
    short_text = self.get('acf.short_text', json=response.meta['item'])
    if short_text is not None:
        educational.add_value('description', unescape(short_text))
    return educational
def getLOMGeneral(self, response):
    """Return the LOM general loader filled with identifier, title, keywords and language.

    Each value is looked up in the response through ``self.get``.
    """
    loader = LomBase.getLOMGeneral(self, response)
    # (loader field, key looked up in the response)
    field_sources = (
        ('identifier', 'id'),
        ('title', 'title'),
        ('keyword', 'keywords'),
        ('language', 'language'),
    )
    for field_name, source_key in field_sources:
        loader.add_value(field_name, self.get(source_key, response=response))
    return loader
def getValuespaces(self, response):
    """Return the valuespaces loader with the LRMI ``learningResourceType`` added."""
    loader = LomBase.getValuespaces(self, response)
    resource_type = self.getLRMI("learningResourceType", response=response)
    loader.add_value("learningResourceType", resource_type)
    return loader
def getValuespaces(self, response):
    """Return the valuespaces loader with the LRMI ``audience.educationalRole`` added
    as ``intendedEndUserRole``."""
    loader = LomBase.getValuespaces(self, response)
    audience_role = self.getLRMI("audience.educationalRole", response=response)
    loader.add_value("intendedEndUserRole", audience_role)
    return loader
def getBase(self, response):
    """Return the base loader with thumbnail, origin and (if available) fulltext.

    For items that carry a ``ccm:replicationsource`` property the fulltext is
    downloaded from the item's ``downloadUrl``; a failed download is logged
    and otherwise ignored. For all other items the text is requested from the
    repository's ``textContent`` endpoint.
    """
    base = LomBase.getBase(self, response)
    item = response.meta["item"]
    base.replace_value("thumbnail", item["preview"]["url"])

    replication_source = self.getProperty("ccm:replicationsource", response)
    base.replace_value("origin", replication_source)

    if replication_source:
        # imported objects usually have the content as binary text
        # TODO: Sometimes, edu-sharing redirects if no local content is found,
        # and this should be html-parsed
        download_url = item["downloadUrl"]
        try:
            r = requests.get(download_url)
        except requests.RequestException as exc:
            # The former call passed an extra argument without a placeholder,
            # which made the logging module fail while formatting the record.
            logging.warning("error fetching data from %s: %s", download_url, exc)
        else:
            if r.status_code == 200:
                base.replace_value("fulltext", r.text)
    else:
        # try to transform using alfresco
        text_content_url = (
            self.apiUrl
            + "/node/v1/nodes/"
            + item["ref"]["repo"]
            + "/"
            + item["ref"]["id"]
            + "/textContent"
        )
        r = requests.get(
            text_content_url,
            headers={"Accept": "application/json"},
        ).json()
        if "text" in r:
            base.replace_value("fulltext", r["text"])
    return base
def getLOMTechnical(self, response):
    """Return the LOM technical loader with format and location added.

    The format is fixed to ``text/html``; the location is the first text node
    under ``url_datensatz`` in ``response.meta["item"]``.
    """
    loader = LomBase.getLOMTechnical(self, response)
    loader.add_value("format", "text/html")
    record_url = response.meta["item"].xpath("url_datensatz//text()").get()
    loader.add_value("location", record_url)
    return loader
def getValuespaces(self, response):
    """Return the valuespaces loader with disciplines matched from ``systematikpfad``.

    A discipline id is added for every entry of
    ``ProcessValuespacePipeline.valuespaces['discipline']`` that has at least
    one label whose ``@value`` occurs (case-insensitively) in the item's
    ``systematikpfad`` text. When the item has no such text, no discipline is
    added.
    """
    valuespaces = LomBase.getValuespaces(self, response)
    text = response.meta['item'].xpath('systematikpfad//text()').get()
    if text is None:
        # ``.get()`` yields None when the node is absent; nothing to match.
        return valuespaces

    # Case-fold the haystack once instead of once per label.
    haystack = text.casefold()
    for entry in ProcessValuespacePipeline.valuespaces['discipline']:
        if any(label['@value'].casefold() in haystack for label in entry['label']):
            valuespaces.add_value('discipline', entry['id'])
    return valuespaces
def getLOMGeneral(self, response):
    """Return the LOM general loader filled with title, keywords and description.

    The title is the stripped ``Name`` of ``response.meta["item"]``. Keywords
    are the non-empty, stripped topic names of the content module; the
    description joins the non-empty, stripped inline-text paragraphs with
    newlines.
    """

    def stripped_texts(query):
        """Return the stripped, non-empty text nodes selected by ``query``."""
        candidates = (raw.strip() for raw in response.xpath(query).getall())
        return [text for text in candidates if text]

    loader = LomBase.getLOMGeneral(self, response)
    loader.add_value("title", response.meta["item"].get("Name").strip())

    topic_names = stripped_texts(
        '//*[@id="ContentModuleApp"]//*[@class="topic-name"]//text()'
    )
    loader.add_value("keyword", topic_names)

    paragraphs = stripped_texts(
        '//*[@id="ContentModuleApp"]//*[@content-module-type="inlinetext"]//p//text()'
    )
    loader.add_value("description", "\n".join(paragraphs).strip())
    return loader
def getValuespaces(self, response):
    """Return the valuespaces loader with discipline and educational contexts.

    The discipline is the first path segment of the first ``categories``
    entry (with ``Mathe`` mapped to ``Mathematik``). Educational contexts are
    derived from keywords of the form ``Klasse <n>``.
    """
    valuespaces = LomBase.getValuespaces(self, response=response)

    text = self.get("categories", response=response)[0].split("/")[0]
    # manual mapping to Mathematik
    if text == "Mathe":
        text = "Mathematik"
    valuespaces.add_value("discipline", text)

    # Fetch the keywords once; they are checked against every grade pattern.
    keywords = list(self.getKeywords(response))

    # Raw strings keep ``\s`` a regex escape instead of an invalid string escape.
    # NOTE(review): unlike the first two patterns, the last one has no ``$``
    # anchor; it is kept as-is to preserve which keywords match.
    grade_contexts = (
        (r"Klasse\s[1-4]$", "Grundschule"),
        (r"Klasse\s([5-9]|10)$", "Sekundarstufe 1"),
        (r"Klasse\s1[1-2]", "Sekundarstufe 2"),
    )
    for pattern, context in grade_contexts:
        grade = re.compile(pattern, re.IGNORECASE)
        if any(grade.match(keyword) for keyword in keywords):
            valuespaces.add_value("educationalContext", context)
    return valuespaces
def getValuespaces(self, response):
    """Return the valuespaces loader with disciplines matched from ``systematikpfad``.

    A discipline id is added for every entry of
    ``self.valuespacesMapping.data["discipline"]`` whose German preferred
    label occurs (case-insensitively) in the item's ``systematikpfad`` text.
    When the item has no such text, no discipline is added.
    """
    valuespaces = LomBase.getValuespaces(self, response)
    text = response.meta["item"].xpath("systematikpfad//text()").get()
    if text is None:
        # ``.get()`` yields None when the node is absent; nothing to match.
        return valuespaces

    # Case-fold the haystack once instead of once per entry.
    haystack = text.casefold()
    for entry in self.valuespacesMapping.data["discipline"]:
        if entry["prefLabel"]["de"].casefold() in haystack:
            valuespaces.add_value("discipline", entry["id"])
    return valuespaces
def getLOMGeneral(self, response):
    """Return the LOM general loader filled with title, keywords and description.

    Title and description come from ``self.get``; keywords from
    ``self.getKeywords``.
    """
    loader = LomBase.getLOMGeneral(self, response=response)

    title = self.get("title", response=response)
    loader.add_value("title", title)

    keywords = self.getKeywords(response)
    loader.add_value("keyword", keywords)

    description = self.get("description", response=response)
    loader.add_value("description", description)
    return loader
def mapResponse(self, response):
    """Return the mapped response with the license footer cut off its text.

    Everything from the first occurrence of the CC BY-SA footer sentence
    onwards is removed from the ``text`` value.
    """
    loader = LomBase.mapResponse(self, response)
    license_footer = "Dieses Werk steht unter der freien Lizenz CC BY-SA 4.0 Information"
    # Keep only the part before the footer (the whole text if it is absent).
    body_text, _, _ = loader.load_item()["text"].partition(license_footer)
    loader.replace_value("text", body_text)
    return loader
def getLOMGeneral(self, response):
    """Return the LOM general loader filled with title and language.

    The title is the page's ``<title>`` text; the language is the content of
    the ``og:locale`` meta tag.
    """
    loader = LomBase.getLOMGeneral(self, response)

    page_title = response.xpath('//title//text()').get()
    loader.add_value('title', page_title)

    locale = response.xpath('//meta[@property="og:locale"]/@content').get()
    loader.add_value('language', locale)
    return loader
def getLOMEducational(self, response):
    """Return the LOM educational loader filled from an OAI-PMH record.

    Namespaces are removed from the response selector first. Description,
    typical age range (``from-to`` format) and language are read from the
    record's LOM metadata. An age range without a ``-`` separator is logged
    and not added.
    """
    response.selector.remove_namespaces()
    record = response.xpath('//OAI-PMH/GetRecord/record')

    # Pass ``self`` explicitly, as the other LomBase delegations do; it was
    # missing here, so ``response`` was bound to the ``self`` parameter.
    educational = LomBase.getLOMEducational(self, response)

    # TODO put in general description
    educational.add_value(
        'description',
        record.xpath('metadata/lom/general/description/string//text()').extract_first(),
    )

    age_range_text = record.xpath(
        'metadata/lom/educational/typicalAgeRange/string//text()'
    ).extract_first()
    if age_range_text:
        age_range = LomAgeRangeItemLoader()
        bounds = age_range_text.split('-')
        if len(bounds) > 1:
            age_range.add_value('fromRange', bounds[0])
            age_range.add_value('toRange', bounds[1])
            educational.add_value('typicalAgeRange', age_range.load_item())
        else:
            self.logger.info('unknown agerange %s', age_range_text)

    educational.add_value(
        'language',
        record.xpath('metadata/lom/educational/language//text()').extract_first(),
    )
    return educational
def getBase(self, response):
    """Return the base loader with the page's ``og:image`` as thumbnail.

    The image path is prefixed with ``self.url`` and its ``_350`` marker is
    replaced by ``_1000``. Pages without an ``og:image`` get no thumbnail.
    """
    loader = LomBase.getBase(self, response)
    og_image = response.xpath('//meta[@property="og:image"]//@content').get()
    if not og_image:
        return loader
    larger_image = og_image.replace("_350", "_1000")
    loader.add_value("thumbnail", self.url + larger_image)
    return loader
def getPermissions(self, response):
    """Return the permissions loader marked non-public, with auto-created groups.

    ``public`` is replaced by ``False``; ``autoCreateGroups`` is set and the
    ``public`` group is added.
    """
    loader = LomBase.getPermissions(self, response)
    loader.replace_value("public", False)
    additions = (
        ("autoCreateGroups", True),
        ("groups", ["public"]),
    )
    for field_name, value in additions:
        loader.add_value(field_name, value)
    return loader
def getBase(self, response: Response) -> items.BaseItemLoader:
    """Return the base loader filled with origin, last-modified date, thumbnail and fulltext.

    The origin is the stripped ``sourceTitle`` of ``response.meta["row"]``;
    the last-modified date is the item's ``snippet.publishedAt``.
    """
    loader = LomBase.getBase(self, response)

    source_title = response.meta["row"]["sourceTitle"].strip()
    loader.add_value("origin", source_title)

    published_at = response.meta["item"]["snippet"]["publishedAt"]
    loader.add_value("lastModified", published_at)

    thumbnail_url = self.getThumbnailUrl(response)
    loader.add_value("thumbnail", thumbnail_url)

    fulltext = self.getFulltext(response)
    loader.add_value("fulltext", fulltext)
    return loader
def getLOMTechnical(self, response):
    """Return the LOM technical loader with format, location and size added.

    The format is fixed to ``text/html``; the location comes from
    ``self.getUri`` and the size is the length of the response body.
    """
    loader = LomBase.getLOMTechnical(self, response)
    loader.add_value("format", "text/html")

    location = self.getUri(response)
    loader.add_value("location", location)

    body_size = len(response.body)
    loader.add_value("size", body_size)
    return loader
def getLOMEducational(self, response):
    """Return the LOM educational loader with the item's description.

    The description is the stripped, HTML-unescaped first text node under
    ``beschreibung``. Descriptions starting with ``swiffyobject_`` are
    treated as invalid and skipped, as is a missing description node.
    """
    # ``HTMLParser.unescape`` was removed in Python 3.9; ``html.unescape`` is
    # its replacement. Imported locally so the block does not depend on the
    # file's import section.
    from html import unescape

    educational = LomBase.getLOMEducational(self, response)
    raw_desc = response.meta['item'].xpath('beschreibung//text()').get()
    if raw_desc is None:
        # ``.get()`` yields None when the node is absent; nothing to add.
        return educational

    desc = raw_desc.strip()
    # dirty cleaning of invalid descriptions
    # not perfect yet, these objects also appear inside the content
    if not desc.startswith('swiffyobject_'):
        educational.add_value('description', unescape(desc))
    return educational
def getLOMLifecycle(self, response: Response) -> items.LomLifecycleItemloader:
    """Return the LOM lifecycle loader describing the channel as author.

    The organization is the item's ``snippet.channelTitle``; the URL comes
    from ``self.getChannelUrl``.
    """
    loader = LomBase.getLOMLifecycle(self, response)
    loader.add_value("role", "author")

    channel_title = response.meta["item"]["snippet"]["channelTitle"]
    loader.add_value("organization", channel_title)

    channel_url = self.getChannelUrl(response)
    loader.add_value("url", channel_url)
    return loader
def getValuespaces(self, response):
    """Return the valuespaces loader with fixed educational contexts and disciplines."""
    loader = LomBase.getValuespaces(self, response)

    contexts = ("sekundarstufe_2", "berufliche_bildung", "erwachsenenbildung")
    for context in contexts:
        loader.add_value("educationalContext", context)

    disciplines = (
        "700",  # Wirtschaftskunde
        "48005",  # Gesellschaftskunde
    )
    for discipline_id in disciplines:
        loader.add_value("discipline", discipline_id)
    return loader