def getValuespaces(self, response):
    """Extend the base valuespaces with values scraped from the page sidebar.

    Adds, on a best-effort basis:
      * ``educationalContext`` from the grade range of the "icon-level" entry,
      * ``discipline`` from the text nodes of the "icon-subject" entry,
      * ``learningResourceType`` from ``response.meta['item']['type']`` and
        from the (stripped) text nodes of the "icon-settings" entry.

    A failure while extracting one of the sidebar entries is logged and does
    not prevent the remaining values from being collected.

    :param response: response of the detail page; ``response.meta['item']``
        must be present.
    :return: the valuespaces loader obtained from ``LrmiBase.getValuespaces``.
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)
    valuespaces = LrmiBase.getValuespaces(self, response)

    def sidebar_text_nodes(icon_class):
        # Select every text node of the sidebar entry that carries the icon.
        return response.xpath(
            '//ul[@class="sidebar__information"]'
            '/li[@class="sidebar__information-item"]'
            '/*[contains(@class,"' + icon_class + '")]/parent::*//text()'
        )

    try:
        grade_text = sidebar_text_nodes('icon-level').get()
        # .get() yields None when the page has no grade entry; that is an
        # expected situation, not an error.
        if grade_text is not None:
            grades = grade_text.replace('Stufe', '').strip().split(' - ')
            valuespaces.add_value(
                'educationalContext',
                ValuespaceHelper.educationalContextByGrade(grades),
            )
    except Exception:
        log.warning(
            "Could not map educationalContext for %s",
            response.url,
            exc_info=True,
        )

    try:
        valuespaces.add_value(
            'discipline', sidebar_text_nodes('icon-subject').getall()
        )
    except Exception:
        log.warning(
            "Could not extract discipline for %s", response.url, exc_info=True
        )

    lrt = response.meta['item'].get('type')
    valuespaces.add_value('learningResourceType', lrt)

    try:
        tool_types = [
            text.strip() for text in sidebar_text_nodes('icon-settings').getall()
        ]
        # @TODO: proper mapping, maybe specialised tool field?
        valuespaces.add_value('learningResourceType', tool_types)
    except Exception:
        log.warning(
            "Could not extract tool type for %s", response.url, exc_info=True
        )

    return valuespaces
def getValuespaces(self, response):
    """Add a discipline for every known subject segment found in the URL.

    A subject is added when ``"/" + subject`` occurs anywhere in
    ``response.url``.

    :param response: response whose URL is inspected.
    :return: the valuespaces loader obtained from ``LrmiBase.getValuespaces``.
    """
    valuespaces = LrmiBase.getValuespaces(self, response)
    page_url = response.url
    for subject in ("politik", "geschichte"):
        if f"/{subject}" in page_url:
            valuespaces.add_value("discipline", subject)
    return valuespaces
def getLOMGeneral(self, response):
    """Replace the title with the HTML-unescaped name of the crawled item.

    :param response: response whose ``meta['item']`` dict provides ``name``.
    :return: the general loader obtained from ``LrmiBase.getLOMGeneral``.
    """
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
    # html.unescape() is the supported equivalent.
    from html import unescape

    general = LrmiBase.getLOMGeneral(self, response)
    general.replace_value(
        'title', unescape(response.meta['item'].get('name').strip())
    )
    # Keyword extraction from the page is currently disabled:
    # general.add_value('keyword', list(filter(lambda x: x,map(lambda x: x.strip(), response.xpath('//*[@id="ContentModuleApp"]//*[@class="topic-name"]//text()').getall()))))
    return general
def getBase(self, response):
    """Set thumbnail and type on the base loader.

    The thumbnail is ``self.url`` concatenated with the ``src`` of the
    ``content-info__image`` image. When the page has no such image the
    thumbnail provided by ``LrmiBase.getBase`` is left untouched.

    :param response: response of the detail page.
    :return: the base loader obtained from ``LrmiBase.getBase``.
    """
    base = LrmiBase.getBase(self, response)
    # Alternative source, currently disabled:
    # base.replace_value('thumbnail', self.url + '/media/' + response.meta['item'].get('image'))
    thumbnail_src = response.xpath(
        '//img[@class="content-info__image"]/@src'
    ).get()
    # .get() returns None when nothing matches; concatenating it with
    # self.url would raise TypeError and abort the whole item.
    if thumbnail_src is not None:
        base.replace_value('thumbnail', self.url + thumbnail_src)
    base.replace_value('type', self.getType(response))
    return base
def getLOMTechnical(self, response):
    """Set format and location on the technical loader.

    The location is the LRMI ``mainEntityOfPage`` value, falling back to
    ``response.url`` when that value is falsy.

    :param response: response of the crawled page.
    :return: the technical loader obtained from ``LrmiBase.getLOMTechnical``.
    """
    technical = LrmiBase.getLOMTechnical(self, response)
    technical.replace_value("format", "text/html")
    # The LRMI "ContentSize" is not mapped to "size" (left disabled):
    # technical.add_value("size", self.getLRMI("ContentSize", response=response))
    location = self.getLRMI("mainEntityOfPage", response=response) or response.url
    technical.replace_value("location", location)
    return technical
def getLicense(self, response):
    """Set the license URL from the page's ``cc-license`` link.

    A trailing language part (``deed.de`` or ``de/``) is cut off the link
    before it is stored. Without a link the loader is returned unchanged.

    :param response: response of the crawled page.
    :return: the license loader obtained from ``LrmiBase.getLicense``.
    """
    license_loader = LrmiBase.getLicense(self, response)
    href: str = response.xpath("//div[@class='cc-license']/a/@href").get()
    if not href:
        return license_loader

    # remove language link from license (at most one suffix is stripped)
    for language_suffix in ("deed.de", "de/"):
        if href.endswith(language_suffix):
            href = href[:-len(language_suffix)]
            break

    license_loader.replace_value("url", href)
    return license_loader
def getLOMGeneral(self, response):
    """Fill identifier, keywords, language and description from LRMI data.

    Keywords are read as one comma-separated LRMI string. When no keywords
    are available, the value set by ``LrmiBase.getLOMGeneral`` is kept.

    :param response: response of the crawled page.
    :return: the general loader obtained from ``LrmiBase.getLOMGeneral``.
    """
    general = LrmiBase.getLOMGeneral(self, response)
    general.replace_value(
        "identifier", self.getLRMI("mainEntityOfPage", response=response)
    )

    # Keywords: guard against a missing value (calling .split on None would
    # raise AttributeError) and drop entries that are empty after stripping.
    raw_keywords = self.getLRMI("keywords", response=response)
    if raw_keywords:
        keywords: List[str] = [
            keyword.strip()
            for keyword in raw_keywords.split(",")
            if keyword.strip()
        ]
        general.replace_value("keyword", keywords)

    # Language TODO fill in value by hand or leave empty?
    general.add_value("language", self.getLRMI("inLanguage", response=response))

    # Description
    general.add_value(
        "description", self.getLRMI("description", response=response)
    )
    return general
def getLOMLifecycle(self, response):
    """Map the LRMI ``author`` value onto lifecycle contributor fields.

    * Known editorial names ("Bundeszentrale für politische Bildung",
      "Redaktion", anything containing "Redaktion werkstatt.bpb.de") are
      stored as an author organization.
    * Every other value is treated as one or more persons, separated by
      commas and/or the word " und ". For each person the last
      space-separated token becomes ``lastName`` and the rest ``firstName``.
    * A missing or empty author leaves the loader unchanged.

    :param response: response of the crawled page.
    :return: the lifecycle loader obtained from ``LrmiBase.getLOMLifecycle``.
    """
    name = self.getLRMI("author", response=response)
    lifecycle = LrmiBase.getLOMLifecycle(self, response)

    if not name:
        # Nothing to map; a None author would otherwise raise TypeError on
        # the substring checks below.
        return lifecycle

    is_organization = (
        name in ("Bundeszentrale für politische Bildung", "Redaktion")
        or "Redaktion werkstatt.bpb.de" in name
    )
    if is_organization:
        lifecycle.add_value("role", "author")
        lifecycle.add_value("organization", name)
        return lifecycle

    # Split on " und " (with surrounding spaces, so names merely containing
    # the letters "und" stay intact) as well as on commas. Previously the
    # "und" branch could never be reached.
    for person in name.replace(" und ", ",").split(","):
        person = person.strip()
        if not person:
            continue
        first_name, _, last_name = person.rpartition(" ")
        lifecycle.add_value("role", "author")
        lifecycle.add_value("firstName", first_name.strip())
        lifecycle.add_value("lastName", last_name)
    return lifecycle
def __init__(self, **kwargs):
    """Initialise the instance by forwarding all keyword arguments to ``LrmiBase.__init__``."""
    LrmiBase.__init__(self, **kwargs)
def getLicense(self, response):
    """Return the license loader exactly as built by ``LrmiBase.getLicense``.

    :param response: response of the crawled page.
    """
    return LrmiBase.getLicense(self, response)
def getLOMTechnical(self, response):
    """Force the format to ``text/html`` and the location to the page URL.

    :param response: response of the crawled page.
    :return: the technical loader obtained from ``LrmiBase.getLOMTechnical``.
    """
    technical = LrmiBase.getLOMTechnical(self, response)
    fixed_values = (
        ('format', 'text/html'),
        ('location', response.url),
    )
    for field_name, value in fixed_values:
        technical.replace_value(field_name, value)
    return technical
def getLOMEducational(self, response):
    """Add the HTML-unescaped item teaser as educational description.

    When the item carries no teaser, no description is added.

    :param response: response whose ``meta['item']`` dict may provide
        ``teaser``.
    :return: the educational loader obtained from
        ``LrmiBase.getLOMEducational``.
    """
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
    # html.unescape() is the supported equivalent.
    from html import unescape

    educational = LrmiBase.getLOMEducational(self, response)
    teaser = response.meta['item'].get('teaser')
    # unescape(None) raises TypeError, so skip items without a teaser.
    if teaser is not None:
        educational.add_value('description', unescape(teaser))
    return educational
def getBase(self, response):
    """Build the base loader via ``LrmiBase.getBase`` and replace its thumbnail with ``None``."""
    base = LrmiBase.getBase(self, response)
    # NOTE(review): presumably meant to discard the thumbnail set by the base
    # class. Some ItemLoader versions treat a None value in replace_value as
    # "nothing to do" — confirm that this call really clears the field.
    base.replace_value("thumbnail", None)
    return base
def __init__(self, **kwargs):
    """Initialise both bases explicitly, passing the same keyword arguments to each."""
    # LrmiBase is initialised first, then CrawlSpider.
    LrmiBase.__init__(self, **kwargs)
    CrawlSpider.__init__(self, **kwargs)
def mapResponse(self, response):
    """Return the result of ``LrmiBase.mapResponse`` for ``response`` unchanged."""
    return LrmiBase.mapResponse(self, response)
def handleEntry(self, response):
    """Process a single entry response by delegating to ``LrmiBase.parse``."""
    return LrmiBase.parse(self, response)
def __init__(self, **kwargs):
    """Initialise ``SitemapSpider`` first, then ``LrmiBase``."""
    # The keyword arguments are forwarded to LrmiBase only; SitemapSpider is
    # initialised without any arguments.
    SitemapSpider.__init__(self)
    LrmiBase.__init__(self, **kwargs)
def parse(self, response):
    """Parse ``response`` by delegating explicitly to ``LrmiBase.parse``."""
    return LrmiBase.parse(self, response)
def getLOMTechnical(self, response):
    """Override format (``text/html``) and location (the response URL).

    :param response: response of the crawled page.
    :return: the technical loader obtained from ``LrmiBase.getLOMTechnical``.
    """
    loader = LrmiBase.getLOMTechnical(self, response)
    page_url = response.url
    loader.replace_value("format", "text/html")
    loader.replace_value("location", page_url)
    return loader