def _load_data(self): ''' Returns an RDF representation of a painting on google art ''' # Load HTML page document = BeautifulSoup(urllib2.urlopen(GOOGLE_ROOT + self.url).read()) info_block = document.find(id='info') # Create the resource this = GART[self.url] self.graph.add((this, RDF.type, DCMI['StillImage'])) self.graph.add((this, RDF.type, DCMI['PhysicalResource'])) # Get general information content = info_block.find(attrs={'class':'content'}) self.graph.add((this, DCT['title'], Literal(content.find('h2').string))) self.graph.add((this, DCT['description'], Literal(content.find(attrs={'class':'altLang'}).string))) date_raw = content.find(attrs={'class':'year'}).string date_raw = re.search('([0-9]{4})', date_raw).group(0) self.graph.add((this, DCT['created'], Literal(date(int(date_raw), 1 , 1)))) painter = get_dbpedia_resource(content.find('h3').string.split(',')[0]) if painter == None: painter = BNode() self.graph.add((painter, RDF.type, DCT['Agent'])) self.graph.add((painter, FOAF['name'], content.find('h3').string.split(',')[0])) self.graph.add((this, DCT['creator'], painter)) about_raw = content.findAll('p')[2].getText() medium = get_dbpedia_resource(about_raw[:about_raw.index('Height')]) if medium != None: self.graph.add((this, DCT['medium'], URIRef(medium))) self.graph.add((this, DBP['height'], Literal(about_raw[about_raw.index('Height') + len('Height : '):about_raw.index('Width')]))) self.graph.add((this, DBP['width'], Literal(about_raw[about_raw.index('Width') + len('Width : '):]))) # Get extra information content = info_block.find(attrs={'class':'exclusive'}) for link in content.findAll(attrs={'class':'outside'}): self.graph.add((this, FOAF['page'], URIRef(link.get('href')))) for block in content.findAll('dt'): if block.string == 'Tags': for list in block.findNextSiblings('dd')[0].findAll('tr'): for k in list.getText().split(','): self.graph.add((this, DCT['subject'], Literal(k))) if block.attrMap.has_key('class') and block.get('class') == 'artworkListTrigger': for link in block.findNextSiblings('dd')[0].findAll('a'): if link.attrMap['href'] != "#": self.graph.add((this, DCT['relation'], GARTW[link.get('href')[1:]]))
def _load_data(self): ''' Returns an RDF representation of the list of museums ''' document = BeautifulSoup(urllib2.urlopen(GOOGLE_ROOT).read()) list_block = document.find(id='list') for entry in list_block.findAll('li'): city = entry.find('a').find('span').getText() name = entry.get('data-bg-museum') url = entry.get('data-museum-url') this = GARTW['index.rdf'] museum = GARTW[url[1:]] self.graph.add((this, DCT['relation'], museum)) self.graph.add((museum, DCT['title'], Literal(name))) self.graph.add((museum, DCT['location'], get_dbpedia_resource(city)))