def main():
    """Command-line entry point: build the SQLite database from the RDF index.

    Command-line options:
        --dbname    Filename handed to ``GutenbergDbCreator``
                    (default ``gutenberg.db``).
        --rdfindex  Filename of the bzip2 compressed XML RDF index handed to
                    ``parse_rdf_bz2`` (default ``catalog.rdf.bz2``).

    Every record produced by ``parse_rdf_bz2`` (using the filter of a fresh
    ``GutenbergIndexFilter``) is added to the database. If the filter counted
    any records without titles, that count is printed. Finally
    ``create_custom_title_order_index()``, ``compute_author_downloads()`` and
    ``create_additional_indices()`` are called on the database creator, in
    that order.
    """
    parser = OptionParser(
        description="Parse Gutenberg RDF index file and produce SQLite database.")
    parser.add_option(
        "--dbname",
        dest="db_filename",
        action="store",
        default="gutenberg.db",
        help="The gutenberg.db SQLite database")
    parser.add_option(
        "--rdfindex",
        dest="bz2_rdf_filename",
        action="store",
        default="catalog.rdf.bz2",
        help="The filename for the bzip2 compressed XML RDF index for Project Gutenberg")
    # Positional arguments are accepted by the parser but not used.
    options = parser.parse_args()[0]

    db_creator = GutenbergDbCreator(options.db_filename)
    record_filter = GutenbergIndexFilter()
    records = parse_rdf_bz2(options.bz2_rdf_filename, record_filter.filter)
    db_creator.add_many_records(records)

    if record_filter.notitle_count > 0:
        # Parenthesised single-argument form: prints the same text whether
        # interpreted as a print statement or as a print() call.
        print("Omitted %d records without titles" % record_filter.notitle_count)

    # Post-processing steps run only after all records have been added.
    db_creator.create_custom_title_order_index()
    db_creator.compute_author_downloads()
    db_creator.create_additional_indices()
def main():
    """Command-line entry point: build the SQLite database from the RDF index.

    Command-line options:
        --dbname    Filename handed to ``GutenbergDbCreator``
                    (default ``gutenberg.db``).
        --rdfindex  Filename of the bzip2 compressed XML RDF index handed to
                    ``parse_rdf_bz2`` (default ``catalog.rdf.bz2``).

    Every record produced by ``parse_rdf_bz2`` (using the filter of a fresh
    ``GutenbergIndexFilter``) is added to the database. If the filter counted
    any records without titles, that count is printed. Finally
    ``create_custom_title_order_index()``, ``compute_author_downloads()`` and
    ``create_additional_indices()`` are called on the database creator, in
    that order.
    """
    parser = OptionParser(
        description="Parse Gutenberg RDF index file and produce SQLite database.")
    parser.add_option(
        "--dbname",
        dest="db_filename",
        action="store",
        default="gutenberg.db",
        help="The gutenberg.db SQLite database")
    parser.add_option(
        "--rdfindex",
        dest="bz2_rdf_filename",
        action="store",
        default="catalog.rdf.bz2",
        help="The filename for the bzip2 compressed XML RDF index for Project Gutenberg")
    # Positional arguments are accepted by the parser but not used.
    options = parser.parse_args()[0]

    db_creator = GutenbergDbCreator(options.db_filename)
    record_filter = GutenbergIndexFilter()
    records = parse_rdf_bz2(options.bz2_rdf_filename, record_filter.filter)
    db_creator.add_many_records(records)

    if record_filter.notitle_count > 0:
        # Parenthesised single-argument form: prints the same text whether
        # interpreted as a print statement or as a print() call.
        print("Omitted %d records without titles" % record_filter.notitle_count)

    # Post-processing steps run only after all records have been added.
    db_creator.create_custom_title_order_index()
    db_creator.compute_author_downloads()
    db_creator.create_additional_indices()
def create_gutenberg_index_rdf(bz2_rdf_filename, indexdir): """Build whoosh index from parsed RDF. DB contents are no longer identical to RDF output. Plus index now stores selected db row ids. DEPRECATED""" sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) # don't buffer stdout print "WARNING: direct use of rdf content may not accurately reflect database contents" schema = get_schema() whoosh_index = create_in(indexdir, schema) writer = whoosh_index.writer() for count, record in enumerate(gutenberg_rdf_parser.parse_rdf_bz2(bz2_rdf_filename, GutenbergIndexFilter().filter)): # Only index fields from description records. File records can be ignored. if record['record_type'] == 'DESCRIPTION': if count % 5000 == 0: print count, subset = {k : record[k] for k in schema.names() if k in record} writer.add_document(**subset) print "committing...", writer.commit() print "DONE"
def create_gutenberg_index_rdf(bz2_rdf_filename, indexdir): """Build whoosh index from parsed RDF. DB contents are no longer identical to RDF output. Plus index now stores selected db row ids. DEPRECATED""" sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) # don't buffer stdout print "WARNING: direct use of rdf content may not accurately reflect database contents" schema = get_schema() whoosh_index = create_in(indexdir, schema) writer = whoosh_index.writer() for count, record in enumerate( gutenberg_rdf_parser.parse_rdf_bz2(bz2_rdf_filename, GutenbergIndexFilter().filter)): # Only index fields from description records. File records can be ignored. if record['record_type'] == 'DESCRIPTION': if count % 5000 == 0: print count, subset = {k: record[k] for k in schema.names() if k in record} writer.add_document(**subset) print "committing...", writer.commit() print "DONE"