import logging import os from django.core import management from django.core.management.base import BaseCommand from core import models from core.holding_loader import HoldingLoader from core.management.commands import configure_logging from core.utils.utils import validate_bib_dir configure_logging('load_holdings_logging.config', 'load_holdings.log') _logger = logging.getLogger(__name__) class Command(BaseCommand): help = "Load a holdings records after title records are all loaded" args = '<location of holdings directory>' bib_in_settings = validate_bib_dir() if bib_in_settings: default_location = bib_in_settings + '/holdings' else: default_location = None def handle(self, holdings_source=default_location, *args, **options): if not os.path.exists(holdings_source): _logger.error("There is no valid holdings source folder defined.") set_holdings = [ 'To load holdings - Add a folder called "holdings"',
import os import logging from optparse import make_option from django.conf import settings from django.db import connection from django.core.management.base import BaseCommand, CommandError from solr import SolrConnection from core.batch_loader import BatchLoader, BatchLoaderException from core.management.commands import configure_logging configure_logging('purge_batches_logging.config', 'purge_batch_%s.log' % os.getpid()) log = logging.getLogger(__name__) class Command(BaseCommand): help = "Purge a batch" def add_arguments(self, parser): # Positional arguments parser.add_argument('batch_name', help='Batch name from "batches" command') # Options parser.add_argument( '--optimize', action='store_true', default=False, dest='optimize',
import logging import urllib2 from django.core.management.base import BaseCommand from django.db import reset_queries from rdflib import Namespace, ConjunctiveGraph, URIRef try: import simplejson as json except ImportError: import json from core import models from core.management.commands import configure_logging configure_logging("openoni_link_places.config", "openoni_link_places.log") _logger = logging.getLogger(__name__) geo = Namespace('http://www.w3.org/2003/01/geo/wgs84_pos#') owl = Namespace('http://www.w3.org/2002/07/owl#') dbpedia = Namespace('http://dbpedia.org/ontology/') class Command(BaseCommand): def handle(self, **options): _logger.debug("linking places") for place in models.Place.objects.filter(dbpedia__isnull=True): if not place.city or not place.state: continue # formulate a dbpedia place uri
import os import logging from optparse import make_option from django.core.management.base import BaseCommand from django.core.management.base import CommandError from core.batch_loader import BatchLoader, BatchLoaderException from core.management.commands import configure_logging configure_logging('load_batch_logging.config', 'load_batch_%s.log' % os.getpid()) LOGGER = logging.getLogger(__name__) class Command(BaseCommand): help = """ This command loads the metadata and pages associated with a batch into a database and search index. It may take up to several hours to complete, depending on the batch size and machine. """ def add_arguments(self, parser): # Positional arguments parser.add_argument('batch_path', help='Path to batch files') # Options parser.add_argument('--skip-coordinates', action='store_true',
from django.core.management.base import BaseCommand

from core.management.commands import configure_logging
from core.solr_index import index_pages

configure_logging("index_pages_logging.config", "index_pages.log")


class Command(BaseCommand):
    """Management command that (re)indexes every page into Solr.

    Thin wrapper around :func:`core.solr_index.index_pages`; all of the
    real work happens there.
    """

    def handle(self, **options):
        # Delegate directly -- this command takes no arguments or options.
        index_pages()
from django.conf import settings from django.core.management import call_command from django.core.management.base import BaseCommand try: import simplejson as json except ImportError: import json from openoni import core from core import solr_index from core.management.commands import configure_logging from core.models import Place, Title from core.utils.utils import validate_bib_dir configure_logging("title_sync_logging.config", "title_sync.log") _logger = logging.getLogger(__name__) class Command(BaseCommand): pull_title_updates = make_option('--pull-title-updates', action='store_true', dest='pull_title_updates', default=False, help='Pull down a new set of titles.') option_list = BaseCommand.option_list + (pull_title_updates) help = 'Runs title pull and title load for a complete title refresh.' args = ''
import csv from optparse import make_option from time import mktime from datetime import datetime import feedparser from django.core.management.base import BaseCommand from django.conf import settings from core.management.commands import configure_logging from core.rdf import rdf_uri from core import models as m configure_logging("release.config", "release.log") _logger = logging.getLogger(__name__) class Command(BaseCommand): help = "Updates (Resets if --reset option is used) release datetime on batches from one of following sources (in order of preference) 1. bag-info.txt, if found in the batch source 2. If path to a file is provided with the command, datetime is extracted from the file 3. current public feed 4. current server datetime" reset = make_option('--reset', action = 'store_true', dest = 'reset', default = False, help = 'reset release times to nothing before setting them again') option_list = BaseCommand.option_list + (reset, ) def handle(self, *args, **options): if options['reset']:
import logging

from django.core.cache import cache
from django.core.management.base import BaseCommand, CommandError

from core.management.commands import configure_logging

configure_logging('', 'purge_django_cache.log')

LOGGER = logging.getLogger(__name__)


class Command(BaseCommand):
    help = "Purge the django cache after ingest/purge of a batch"

    def handle(self, *args, **options):
        """Evict the cached aggregates that become stale when a batch changes.

        Raises CommandError if the cache backend cannot be reached; the
        underlying exception is recorded in the purge log first.
        """
        try:
            # delete the total pages count
            LOGGER.info('removing newspaper_info from cache')
            cache.delete('newspaper_info')

            # delete the advanced search title list
            LOGGER.info('removing titles_states from cache')
            cache.delete('titles_states')
        # NOTE: was `except Exception, e` (Python 2 only syntax); `as e`
        # parses under both Python 2.6+ and Python 3.
        except Exception as e:
            LOGGER.exception(e)
            raise CommandError("unable to purge the cache. check the purge_batch_cache log for clues")
import logging from django.core.management.base import BaseCommand from core.management.commands import configure_logging from core.solr_index import index_titles, index_pages configure_logging("index_logging.config", "index.log") _logger = logging.getLogger(__name__) class Command(BaseCommand): help = """ Rebuilds the entire title and page index data in Solr, including the page OCR data. It shouldn't be necessary most of the time, but it can be useful to run if Solr data becomes corrupt (though this is a very rare occurrence), or in cases the Solr index must be deleted, e.g., if you upgrade to a new major version of Solr. *If Solr corruption is suspected, you should run the `zap_index` command prior to reindexing.* This command can take a while to run, because every single page has OCR data which Solr has to index in order to facilitate full-text searching. Plan for 60 to 90 minutes per 100,000 pages in your collection. """ def handle(self, **options): _logger.info("indexing titles")
import os import logging from optparse import make_option from django.core.management.base import BaseCommand from django.core.management.base import CommandError from core import models from core.management.commands import configure_logging configure_logging('diff_batches_logging.config', 'diff_batches_%s.log' % os.getpid()) _logger = logging.getLogger(__name__) class Command(BaseCommand): option_list = BaseCommand.option_list + ( make_option('--skip-process-ocr', action='store_false', dest='process_ocr', default=True, help='Do not generate ocr, and index'), ) help = "Diff batches by name from a batch list file" args = '<batch_list_filename>' def handle(self, batch_list_filename, *args, **options): if len(args)!=0: raise CommandError('Usage is diff_batch %s' % self.args) batches = set()
import logging import os from django.core import management from django.core.management.base import BaseCommand from core import models from core.holding_loader import HoldingLoader from core.management.commands import configure_logging from core.utils.utils import validate_bib_dir configure_logging('load_holdings_logging.config', 'load_holdings.log') _logger = logging.getLogger(__name__) class Command(BaseCommand): help = "Load a holdings records after title records are all loaded" args = '<location of holdings directory>' bib_in_settings = validate_bib_dir() if bib_in_settings: default_location = bib_in_settings + '/holdings' else: default_location = None def handle(self, holdings_source=default_location, *args, **options): if not os.path.exists(holdings_source): _logger.error("There is no valid holdings source folder defined.") set_holdings = ['To load holdings - Add a folder called "holdings"', 'to the bib directory that is set in settings',
import os
import logging
import datetime

from django.core.management.base import BaseCommand
from django.core.management.base import CommandError

from core.load_copyright_map import loadCopyrightMap
from core.management.commands import configure_logging

configure_logging("load_copyright_map_logging.config",
                  "load_copyright_map.log")

LOGGER = logging.getLogger(__name__)


class Command(BaseCommand):
    """Load an lccn/date/copyright mapping file into the database."""

    help = "Add records to lccn-date-copyright maps table from input file."

    def add_arguments(self, parser):
        # Single required positional argument: the mapping file to ingest.
        parser.add_argument('filepath', help="Path to input file")

    def handle(self, *args, **options):
        source = options['filepath']
        try:
            loadCopyrightMap(source)
        except Exception as e:
            # Record the full traceback in the log, then surface a short
            # operator-facing failure message.
            LOGGER.exception(e)
            raise CommandError("unable to load copyright maps. check the load_batch log for clues")
import logging from datetime import datetime from optparse import make_option from django.core.management.base import BaseCommand from django.conf import settings from core import title_pull from core.management.commands import configure_logging configure_logging('pull_titles_logging.config', 'pull_titles.log') _logger = logging.getLogger(__name__) class Command(BaseCommand): help = "Retrieve a fresh pull of titles from OCLC. \ #TODO: add a list of example commands." args = '' #TODO: Remove default from lccn option_list = BaseCommand.option_list + ( make_option('-l', '--lccn', action='store', dest='lccn', default=None, help="Pass a specific lccn to pull down updates from Worldcat."), make_option('-o', '--oclc', action='store', dest='oclc', default=None,
import logging from cStringIO import StringIO from optparse import make_option from django.core.management.base import BaseCommand import pymarc from core.management.commands import configure_logging from core import solr_index from core.models import Title configure_logging("openoni_purge_titles.config", "openoni_purge_etitles.log") _log = logging.getLogger(__name__) class Command(BaseCommand): """ Management command for purging title records which have an 856 field containing a link to Chronicling America, and which appear to be records for an electronic only version of a title 245 $h == [electronic resource]. The script is careful not to purge any records that have issues attached to them. If you want to see the records that will be purged use the --pretend option. """ option_list = BaseCommand.option_list + ( make_option('-p', '--pretend', dest='pretend', action='store_true'), )
import os import csv import codecs from django.core.management.base import BaseCommand from core.management.commands import configure_logging from core.models import Institution configure_logging("load_intitutions_logging.config", "load_institutions_%s.log" % os.getpid()) """ Simple command to load institution data obtained from the MySQL database running in the MARC Standards office. "oid","orgName","altname1","altname2","altname3","altname4","orgCode","lowercode","isilCode","obsoleteOrgCode","createDate","modifiedDate","address1","address2","address3","city","stateID","zip","countryID","ID","cname","prefix","searchable" 22035,"3Com Corporation Technical Library","","","","","CStcTCC","cstctcc","US-CStcTCC","","1995-10-19 00:00:00","1995-10-19 00:00:00","5400 Bayfront Plaza","","","Santa Clara",5,"95052",210,210,"United States","US","yes" """ class Command(BaseCommand): help = 'loads institution csv data into Institution table' args = '<institution_csv_file>' def handle(self, csv_file, *args, **options): for row in unicode_csv_reader(codecs.open(csv_file, encoding='utf-8')): if row[20] != 'United States': continue i = Institution() i.code = row[7].upper() i.name = row[1]
import os import logging from optparse import make_option from django.core.management.base import BaseCommand from django.core.management.base import CommandError from core import batch_loader from core.management.commands import configure_logging configure_logging("load_batches_logging.config", "load_batches_%s.log" % os.getpid()) _logger = logging.getLogger(__name__) class Command(BaseCommand): option_list = BaseCommand.option_list + ( make_option( "--skip-process-ocr", action="store_false", dest="process_ocr", default=True, help="Do not generate ocr, and index", ), make_option( "--skip-process-coordinates", action="store_false", dest="process_ocr", default=True, help="Do not write out word coordinates", ),
import os
import logging

from django.core.management.base import BaseCommand
from django.core.management.base import CommandError

from core.management.commands import configure_logging
from core import tasks

configure_logging('queue_process_coordinates.config',
                  'queue_process_coordinates_%s.log' % os.getpid())

LOGGER = logging.getLogger(__name__)


class Command(BaseCommand):
    # NOTE: the original declared `option_list = BaseCommand.option_list + ()`,
    # which is a no-op override (and breaks on Django versions that removed
    # option_list); simply inheriting is equivalent.
    help = "queue the word coordinates of a batch to be processed"
    args = '<batch name>'

    def handle(self, batch_name, *args, **options):
        """Enqueue asynchronous coordinate processing for one batch.

        batch_name -- name of the batch to process; extra positional
        arguments are rejected. Raises CommandError on usage error or if
        the task cannot be queued.
        """
        if len(args) != 0:
            raise CommandError('Usage is queue_process_coordinates %s' % self.args)

        try:
            # Hand the work off to the task queue (celery-style .delay).
            tasks.process_coordinates.delay(batch_name)
        # Fixed Python 2 only `except Exception, e` syntax.
        except Exception as e:
            LOGGER.exception(e)
            raise CommandError("unable to process coordinates. check the queue_load_batch log for clues")
import os import logging from optparse import make_option from django.core.management.base import BaseCommand from django.core.management.base import CommandError from core.batch_loader import BatchLoader, BatchLoaderException from core.management.commands import configure_logging configure_logging('load_batch_logging.config', 'load_batch_%s.log' % os.getpid()) LOGGER = logging.getLogger(__name__) class Command(BaseCommand): option_list = BaseCommand.option_list + ( make_option('--skip-process-ocr', action='store_false', dest='process_ocr', default=True, help='Do not generate ocr, and index'), make_option('--skip-coordinates', action='store_false', dest='process_coordinates', default=True, help='Do not out word coordinates'), ) help = "Load a batch" args = '<batch name>'
import logging from datetime import datetime import os from optparse import make_option from django.core.management.base import BaseCommand from core import title_loader from core.solr_index import index_titles from core.models import Title from core.management.commands import configure_logging configure_logging('load_titles_logging.config', 'load_titles.log') _logger = logging.getLogger(__name__) class Command(BaseCommand): help = "Load a marcxml file of title records" args = '<location of marcxml>' option_list = BaseCommand.option_list + (make_option('--skip-index', action='store_true', dest='skip_index', default=False, help="\ Skip the index process. Use this if you call this from \ another process such as 'openoni_sync'. If you call this \ directly, you don't want to use this flag. \ "), ) def __init__(self):
import os
import logging

from django.core.management.base import BaseCommand
from django.core.management.base import CommandError

from core.management.commands import configure_logging
from core import tasks

configure_logging('queue_purge_batch_logging.config',
                  'queue_purge_batch_%s.log' % os.getpid())

LOGGER = logging.getLogger(__name__)


class Command(BaseCommand):
    # NOTE: dropped the original `option_list = BaseCommand.option_list + ()`
    # override -- appending an empty tuple is a no-op, and referencing
    # option_list fails outright on Django versions that removed it.
    help = "queue a batch to be purged"
    args = '<batch name>'

    def handle(self, batch_name, *args, **options):
        """Enqueue an asynchronous purge of the named batch.

        Raises CommandError on extra positional arguments or when the
        purge task cannot be queued (details go to the command's log).
        """
        if len(args) != 0:
            raise CommandError('Usage is queue_purge_batch %s' % self.args)
        try:
            tasks.purge_batch.delay(batch_name)
        # Fixed Python 2 only `except Exception, e` syntax.
        except Exception as e:
            LOGGER.exception(e)
            raise CommandError(
                "unable to queue purge batch. check the queue_purge_batch log for clues"
            )
import logging
import os

from django.core.cache import cache
from django.core.management.base import BaseCommand, CommandError

from core.management.commands import configure_logging

configure_logging('delete_cache_logging.config',
                  'delete_cache_%s.log' % os.getpid())

logger = logging.getLogger(__name__)


class Command(BaseCommand):
    help = "Delete newspaper info and title state cache"

    def handle(self, *args, **options):
        """Drop the cached newspaper_info and titles_states entries.

        Takes no arguments; raises CommandError on usage error or when
        the cache backend fails (the exception is logged first).
        """
        if len(args) != 0:
            raise CommandError('Usage is `manage.py delete_cache`')

        try:
            logger.info("Deleting newspaper_info cache...")
            cache.delete('newspaper_info')

            logger.info("Deleting titles_states cache...")
            cache.delete('titles_states')
        # Fixed Python 2 only `except Exception, e` syntax.
        except Exception as e:
            logger.exception(e)
            raise CommandError("Unable to delete newspaper info and title state cache")
import json import logging import urllib from django.core.management.base import BaseCommand from django.db import reset_queries from core import models from core.management.commands import configure_logging configure_logging("openoni_map_places.config", "openoni_map_places.log") _logger = logging.getLogger("map_places") geonames_url="http://api.geonames.org/searchJSON" class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument("username", action="store", help="Username for the GeoNames API; set to 'demo' if you don't plan to pull any GeoNames data") parser.add_argument('state', action='store', help='State code (e.g., OR, NE, etc) for restricting the search results'), def handle(self, *args, **options): _logger.debug("Finding places in Geonames") # Gather up all the places' latitude and longitude data output_data = {} for place in models.Place.objects.all(): if not place.city: _logger.error("A place with no city exists in your database (%s)! " + "This is probably A Bad Thing (tm)." % place.name) continue
import os import logging from optparse import make_option from django.conf import settings from django.db import connection from django.core.management.base import BaseCommand, CommandError from solr import SolrConnection from core.batch_loader import BatchLoader, BatchLoaderException from core.management.commands import configure_logging configure_logging('purge_batches_logging.config', 'purge_batch_%s.log' % os.getpid()) log = logging.getLogger(__name__) class Command(BaseCommand): option_list = BaseCommand.option_list + ( make_option('--no-optimize', action='store_false', dest='optimize', default=True, help='Do not optimize Solr and MySQL after purge'), ) help = "Purge a batch" args = '<batch_location>' def handle(self, batch_location=None, *args, **options):
import os import logging from optparse import make_option from django.core.management.base import BaseCommand from django.core.management.base import CommandError from core import models from core.management.commands import configure_logging configure_logging('diff_batches_logging.config', 'diff_batches_%s.log' % os.getpid()) _logger = logging.getLogger(__name__) class Command(BaseCommand): help = "Diff batches by name from a batch list file" def add_arguments(self, parser): # Positional arguments parser.add_argument('batch_list_filename') def handle(self, batch_list_filename, *args, **options): if len(args) != 0: raise CommandError('Usage is diff_batch %s' % self.args) batches = set() batch_list = file(batch_list_filename) _logger.info("batch_list_filename: %s" % batch_list_filename) for line in batch_list:
import logging

from django.core.management.base import BaseCommand

from core.management.commands import configure_logging
from core.solr_index import index_titles

configure_logging("index_titles_logging.config", "index_titles.log")

_logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Rebuild the Solr index for title records only.

    Delegates to :func:`core.solr_index.index_titles`; log lines bracket
    the run so progress is visible in the index_titles log.
    """

    def handle(self, **options):
        _logger.info("indexing titles")
        index_titles()
        _logger.info("finished indexing titles")
import logging import urllib2 from django.core.management.base import BaseCommand from django.db import reset_queries from rdflib import Namespace, ConjunctiveGraph, URIRef try: import simplejson as json except ImportError: import json from core import models from core.management.commands import configure_logging configure_logging("openoni_link_places.config", "openoni_link_places.log") _logger = logging.getLogger(__name__) geo = Namespace('http://www.w3.org/2003/01/geo/wgs84_pos#') owl = Namespace('http://www.w3.org/2002/07/owl#') dbpedia = Namespace('http://dbpedia.org/ontology/') class Command(BaseCommand): def handle(self, **options): _logger.debug("linking places") for place in models.Place.objects.filter(dbpedia__isnull=True): if not place.city or not place.state: continue # formulate a dbpedia place uri path = urllib2.quote('%s,_%s' %
import logging from datetime import datetime from optparse import make_option from django.core.management.base import BaseCommand from django.conf import settings from core import title_pull from core.management.commands import configure_logging configure_logging('pull_titles_logging.config', 'pull_titles.log') _logger = logging.getLogger(__name__) class Command(BaseCommand): help = "Retrieve a fresh pull of titles from OCLC. \ #TODO: add a list of example commands." args = '' #TODO: Remove default from lccn option_list = BaseCommand.option_list + ( make_option( '-l', '--lccn', action='store', dest='lccn', default=None, help="Pass a specific lccn to pull down updates from Worldcat."), make_option('-o',
import os import logging from django.core.management.base import BaseCommand from django.core.management.base import CommandError from core import batch_loader from core.management.commands import configure_logging configure_logging('process_coordinates_logging.config', 'process_coordinates_%s.log' % os.getpid()) _logger = logging.getLogger(__name__) class Command(BaseCommand): option_list = BaseCommand.option_list + ( ) help = "Process word coordinates for a batch by name from a batch list file" args = '<batch_list_filename>' def handle(self, batch_list_filename, *args, **options): if len(args)!=0: raise CommandError('Usage is process_coordinates %s' % self.args) loader = batch_loader.BatchLoader() batch_list = file(batch_list_filename) _logger.info("batch_list_filename: %s" % batch_list_filename) for line in batch_list: batch_name = line.strip() _logger.info("batch_name: %s" % batch_name)
import os
import logging

from django.core.management.base import BaseCommand
from django.core.management.base import CommandError

from core.management.commands import configure_logging
from core import tasks

configure_logging('queue_purge_batch_logging.config',
                  'queue_purge_batch_%s.log' % os.getpid())

LOGGER = logging.getLogger(__name__)


class Command(BaseCommand):
    # NOTE: dropped the original `option_list = BaseCommand.option_list + ()`
    # override -- appending an empty tuple is a no-op, and referencing
    # option_list fails on Django versions that removed it.
    help = "queue a batch to be purged"
    args = '<batch name>'

    def handle(self, batch_name, *args, **options):
        """Enqueue an asynchronous purge of the named batch.

        Raises CommandError on extra positional arguments or when the
        purge task cannot be queued (details go to the command's log).
        """
        if len(args) != 0:
            raise CommandError('Usage is queue_purge_batch %s' % self.args)
        try:
            tasks.purge_batch.delay(batch_name)
        # Fixed Python 2 only `except Exception, e` syntax.
        except Exception as e:
            LOGGER.exception(e)
            raise CommandError("unable to queue purge batch. check the queue_purge_batch log for clues")
import logging from datetime import datetime import os from optparse import make_option from django.core.management.base import BaseCommand from core import title_loader from core.solr_index import index_titles from core.models import Title from core.management.commands import configure_logging configure_logging('load_titles_logging.config', 'load_titles.log') _logger = logging.getLogger(__name__) class Command(BaseCommand): help = "Load a marcxml file of title records" args = '<location of marcxml>' option_list = BaseCommand.option_list + ( make_option('--skip-index', action='store_true', dest='skip_index', default=False, help="\ Skip the index process. Use this if you call this from \ another process such as 'openoni_sync'. If you call this \ directly, you don't want to use this flag. \ "), )
import glob import logging import os import requests from urllib import parse from django.core.management.base import BaseCommand from django.conf import settings from core.management.commands import configure_logging from solr import SolrConnection configure_logging('setup_index_logging.config', 'setup_index.log') _logger = logging.getLogger(__name__) fixture_dir = os.path.abspath( os.path.join(os.path.dirname(__file__), '../../fixtures')) schema_url = settings.SOLR_BASE_URL + '/api/cores/openoni/schema' # Copy fields are defined here because we have to manually check for dupes; for # some reason Solr doesn't do this for us, and will in fact allow dozens of the # same copy-field definition. The structure should be obvious, and is the # exact format Solr's API takes. copy_fields = [{ 'source': 'place_of_publication', 'dest': 'place_of_publication_facet' }, { 'source': 'subject', 'dest': 'subject_facet' }, { 'source': 'title', 'dest': 'title_facet'
import os import logging import datetime from django.core.management.base import BaseCommand from django.core.management.base import CommandError from core.load_copyright_map import loadCopyrightMap from core.management.commands import configure_logging configure_logging("load_copyright_map_logging.config", "load_copyright_map.log") LOGGER = logging.getLogger(__name__) class Command(BaseCommand): help = """ Defines rules for which titles should use a given rights statement for certain date ranges. Rights *must* first be loaded via the load_copyright command. Rules are composed of four-field tab-separated-values files, where each line indicates a single rule. The fields, in order, are LCCN, start date, end date, and rights URI. Start and end dates must be formatted as `YYYY-MM-DD`, e.g., `2001-09-08` means September 8th, 2001. Please note that loading the same file multiple times will result in duplicated data, and manual SQL may be needed to clean dupes from `core_lccndatecopyright`. """ def add_arguments(self, parser): parser.add_argument('filepath', help="Path to input file")
import logging import os from django.conf import settings from django.core.management.base import BaseCommand from core.management.commands import configure_logging from core.models import Batch, OcrDump configure_logging("dump_ocr_logging.config", "dump_ocr.log") _logger = logging.getLogger(__name__) class Command(BaseCommand): help = "looks for batches that need to have ocr dump files created" def handle(self, *args, **options): if not os.path.isdir(settings.OCR_DUMP_STORAGE): os.makedirs(settings.OCR_DUMP_STORAGE) for batch in Batch.objects.filter(ocr_dump__isnull=True): _logger.info("starting to dump ocr for %s", batch) try: if batch.ocr_dump: _logger.info("Ocr is already generated for %s", batch) continue except OcrDump.DoesNotExist: pass dump = OcrDump.new_from_batch(batch)