def test_load_metadata(self):
    """Check that the loaded metadata answers three basic queries.

    The object returned by ``load_metadata()`` is queried with three-part
    slices, and each query must yield at least one result.
    """
    # NOTE(review): presumably an rdflib graph, where the slice slots are
    # subject:predicate:object -- confirm against load_metadata().
    metadata = load_metadata()

    # Only the third slot is filled: PGTERMS.ebook.
    ebook_matches = list(metadata[::PGTERMS.ebook])
    self.assertGreater(len(ebook_matches), 0)

    # Only the middle slot is filled: DCTERMS.creator.
    creator_matches = list(metadata[:DCTERMS.creator:])
    self.assertGreater(len(creator_matches), 0)

    # Only the middle slot is filled: DCTERMS.title.
    title_matches = list(metadata[:DCTERMS.title:])
    self.assertGreater(len(title_matches), 0)
def _first_object(graph, subject, predicate):
    """Return the object of the first (subject, predicate, *) triple, or None.

    Args:
        graph: object exposing ``triples((s, p, o))``, as returned by
            ``load_metadata()``.
        subject: subject node to match.
        predicate: predicate node to match.

    Returns:
        The third element of the first matching triple, or ``None`` when
        nothing matches.
    """
    matches = list(graph.triples((subject, predicate, None)))
    if not matches:
        return None
    return matches[0][2]


def _lcsh_subject_values(graph, ebook, dcam):
    """Collect the values of an ebook's subjects whose scheme is DCTERMS.LCSH.

    Subjects with no ``memberOf`` triple or no ``RDF.value`` triple are
    skipped instead of raising IndexError.
    """
    values = []
    # Materialise the subject triples before issuing nested queries on the
    # same graph, as the original code did.
    for _, _, subject in list(graph.triples((ebook, DCTERMS.subject, None))):
        # Use http://id.loc.gov/authorities/subjects.html
        # Library of Congress Subject Headings
        scheme = _first_object(graph, subject, dcam.memberOf)
        if scheme is not None and scheme == DCTERMS.LCSH:
            value_node = _first_object(graph, subject, RDF.value)
            if value_node is not None:
                values.append(value_node.value)
    return values


def _web_formats(graph, ebook, format_predicate):
    """Describe the ebook's formats whose media type mentions html or epub.

    Format nodes missing an extent, a format node, or a media-type value are
    skipped instead of raising IndexError.

    Returns:
        A list of dicts with the keys ``uri``, ``extent`` and ``media_type``.
    """
    formats = []
    for _, _, has_format in list(
            graph.triples((ebook, DCTERMS.hasFormat, None))):
        extent = _first_object(graph, has_format, DCTERMS.extent)
        format_node = _first_object(graph, has_format, format_predicate)
        if extent is None or format_node is None:
            continue
        media_node = _first_object(graph, format_node, RDF.value)
        if media_node is None:
            continue
        # ``unicode`` is the Python 2 builtin; this module is Python 2 code.
        media_type = unicode(media_node)
        if "html" in media_type or "epub" in media_type:
            formats.append({
                "uri": has_format,
                "extent": extent.value,
                "media_type": media_type,
            })
    return formats


def generate_gutenberg_json():
    """
    Uses the Gutenberg module to fetch and write out a subset of the Project
    Gutenberg metadata properties in JSON to standard out.

    Each ebook becomes one JSON object with the keys ``_id``, ``title``,
    ``num_downloads``, ``subjects`` and ``formats``, plus ``publisher``,
    ``creator`` and ``bookshelf`` when the corresponding triples exist.
    Ebooks without a title or without a download count are left out.

    Missing optional data (publisher, subject scheme/value, format
    extent/type) no longer aborts the run with an uncaught IndexError; the
    affected field or entry is simply omitted.

    Returns:
        nothing, output is to standard out
    """
    # Parenthesised single-argument print produces the same output whether
    # it is parsed as a Python 2 statement or a Python 3 call.
    print("Populating Project Gutenberg RDF cache")
    print("If not already populated, this may take several hours...")
    try:
        cache = get_metadata_cache()
        cache.populate()
    except CacheAlreadyExistsException:
        pass  # Don't reload the cache if it already exists

    print("Cache populated, iterating over ebooks")
    g = load_metadata()

    pgterms = Namespace('http://www.gutenberg.org/2009/pgterms/')
    dcam = Namespace('http://purl.org/dc/dcam/')
    # Built by hand rather than via namespace attribute access.
    # NOTE(review): the original comment said "PGTERMS.format is taken", but
    # this URI lives under dc/terms; presumably ``.format`` resolves to the
    # string method on the namespace object -- confirm.
    purl_format = URIRef('http://purl.org/dc/terms/format')

    records = []
    for ebook, _, _ in g.triples((None, RDF.type, pgterms.ebook)):
        record = {"_id": str(ebook).split("/")[-1]}

        title = _first_object(g, ebook, DCTERMS.title)
        if title is None:
            # If there's no title, don't index this ebook
            continue
        record["title"] = title.value

        downloads = _first_object(g, ebook, pgterms.downloads)
        if downloads is None:
            # If there's no download count, don't index this ebook
            continue
        record["num_downloads"] = downloads.value

        publisher = _first_object(g, ebook, DCTERMS.publisher)
        if publisher is not None:
            record["publisher"] = publisher.value

        creator = _first_object(g, ebook, DCTERMS.creator)
        if creator is not None:
            creator_name = _first_object(g, creator, pgterms.name)
            if creator_name is not None:
                record["creator"] = creator_name.value

        record["subjects"] = _lcsh_subject_values(g, ebook, dcam)

        bookshelf = _first_object(g, ebook, pgterms.bookshelf)
        if bookshelf is not None:
            bookshelf_node = _first_object(g, bookshelf, RDF.value)
            if bookshelf_node is not None:
                record["bookshelf"] = unicode(bookshelf_node)

        record["formats"] = _web_formats(g, ebook, purl_format)
        records.append(record)

    print(dumps(records, indent=4))
def test_load_metadata(self):
    """Verify that the loaded metadata is non-empty for three queries.

    Queries the object returned by ``load_metadata()`` with three-part
    slices for PGTERMS.ebook, DCTERMS.creator and DCTERMS.title, and
    requires at least one result from each.
    """
    metadata = load_metadata()
    # assertGreater reports the actual count when it fails, whereas
    # assertTrue(len(...) > 0) only reports "False is not true".
    self.assertGreater(len(list(metadata[::PGTERMS.ebook])), 0)
    self.assertGreater(len(list(metadata[:DCTERMS.creator:])), 0)
    self.assertGreater(len(list(metadata[:DCTERMS.title:])), 0)
def generate_gutenberg_json():
    """
    Uses the Gutenberg module to fetch and write out a subset of the Project
    Gutenberg metadata properties in JSON to standard out.

    One JSON object is written per ebook. ``_id``, ``title``,
    ``num_downloads``, ``subjects`` and ``formats`` are always present;
    ``publisher``, ``creator`` and ``bookshelf`` appear only when the
    matching triples exist. An ebook lacking a title or a download count is
    not written at all.

    Lookups that find no triple (publisher, a subject's scheme or value, a
    format's extent, type or media value) are skipped rather than raising an
    uncaught IndexError that would abort the whole export.

    Returns:
        nothing, output is to standard out
    """
    # A parenthesised single argument prints identically as a Python 2
    # statement and as a Python 3 function call.
    print("Populating Project Gutenberg RDF cache")
    print("If not already populated, this may take several hours...")
    try:
        cache = get_metadata_cache()
        cache.populate()
    except CacheAlreadyExistsException:
        pass  # Don't reload the cache if it already exists

    print("Cache populated, iterating over ebooks")
    g = load_metadata()

    PGTERMS = Namespace("http://www.gutenberg.org/2009/pgterms/")
    DCAM = Namespace("http://purl.org/dc/dcam/")
    # Spelled out as a full URI instead of using namespace attribute access.
    # NOTE(review): the original comment blamed "PGTERMS.format" being taken,
    # yet the URI is in the dc/terms namespace -- confirm which was meant.
    PURLFORMAT = URIRef("http://purl.org/dc/terms/format")

    def objects_of(subject, predicate):
        """Return the objects of every (subject, predicate, *) triple in g."""
        return [obj for _, _, obj in g.triples((subject, predicate, None))]

    returns = []
    for ebook, _, _ in g.triples((None, RDF.type, PGTERMS.ebook)):
        ret = {"_id": str(ebook).split("/")[-1]}

        titles = objects_of(ebook, DCTERMS.title)
        if not titles:
            # If there's no title, don't index this ebook
            continue
        ret["title"] = titles[0].value

        download_counts = objects_of(ebook, PGTERMS.downloads)
        if not download_counts:
            # If there's no download count, don't index this ebook
            continue
        ret["num_downloads"] = download_counts[0].value

        publishers = objects_of(ebook, DCTERMS.publisher)
        if publishers:
            ret["publisher"] = publishers[0].value

        creators = objects_of(ebook, DCTERMS.creator)
        if creators:
            creator_names = objects_of(creators[0], PGTERMS.name)
            if creator_names:
                ret["creator"] = creator_names[0].value

        subject_values = []
        for subject in objects_of(ebook, DCTERMS.subject):
            # Use http://id.loc.gov/authorities/subjects.html
            # Library of Congress Subject Headings
            schemes = objects_of(subject, DCAM.memberOf)
            if not schemes or not (schemes[0] == DCTERMS.LCSH):
                continue
            values = objects_of(subject, RDF.value)
            if values:
                subject_values.append(values[0].value)
        ret["subjects"] = subject_values

        bookshelves = objects_of(ebook, PGTERMS.bookshelf)
        if bookshelves:
            bookshelf_values = objects_of(bookshelves[0], RDF.value)
            if bookshelf_values:
                # ``unicode`` is the Python 2 builtin used by this module.
                ret["bookshelf"] = unicode(bookshelf_values[0])

        formats = []
        for has_format in objects_of(ebook, DCTERMS.hasFormat):
            extents = objects_of(has_format, DCTERMS.extent)
            format_nodes = objects_of(has_format, PURLFORMAT)
            if not extents or not format_nodes:
                continue
            media_nodes = objects_of(format_nodes[0], RDF.value)
            if not media_nodes:
                continue
            media_type = unicode(media_nodes[0])
            # Only formats whose media type mentions html or epub are kept.
            if "html" in media_type or "epub" in media_type:
                formats.append({
                    "uri": has_format,
                    "extent": extents[0].value,
                    "media_type": media_type,
                })
        ret["formats"] = formats

        returns.append(ret)

    print(dumps(returns, indent=4))