Example #1
    def handle(self, **options):

        LOGGER.info("indexing titles")
        index_titles()
        LOGGER.info("finished indexing titles")

        LOGGER.info("indexing pages")
        index_pages()
        LOGGER.info("finished indexing pages")
Example #2
    def handle(self, **options):

        _logger.info("indexing titles")
        index_titles()
        _logger.info("finished indexing titles")

        _logger.info("indexing pages")
        index_pages()
        _logger.info("finished indexing pages")
Example #3
    def xml_file_handler(self, marc_xml, skip_index):
        self.xml_start = datetime.now()
        results = title_loader.load(marc_xml)

        if not skip_index:
            # need to index any titles that we just created
            self.stdout.write("indexing new titles")
            index_titles(since=self.xml_start)

        return results
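Passing since=self.xml_start restricts the follow-up reindex to titles created after the load began, rather than reindexing everything. A hedged sketch of what such a helper typically looks like; the model, field, and module names here are assumptions, not the project's actual code:

    from django.conf import settings
    from solr import SolrConnection  # solrpy, as also used in Example #4

    from core import models  # assumed module path

    def index_titles(since=None):
        # reindex every title, or only those created after `since`
        solr = SolrConnection(settings.SOLR)
        titles = models.Title.objects.all()
        if since is not None:
            titles = titles.filter(created__gte=since)
        for title in titles:
            solr.add(**title.solr_doc)  # assumes a solr_doc dict property
        solr.commit()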
Example #4
    def handle(self, *args, **options):
        def get_immediate_subdirectories(a_dir):
            return [
                name for name in os.listdir(a_dir)
                if os.path.isdir(os.path.join(a_dir, name))
            ]

        def slack(message):
            sc.api_call("chat.postMessage", channel="#ghnp", text=message)

        start = datetime.now()

        sc = SlackClient(settings.SLACK_KEY)

        loader = BatchLoader()

        new_batches_location = '/opt/chronam/data/chronamftp/new_batches/'
        replacement_batches_location = '/opt/chronam/data/chronamftp/replacement_batches/'
        nonlccn_location = '/opt/chronam/data/nonlccn/'
        batch_drop = '/opt/chronam/data/dlg_batches/drop/'

        # GET LIST OF BATCHES TO LOAD
        new_batches = get_immediate_subdirectories(new_batches_location)
        replacement_batches = get_immediate_subdirectories(
            replacement_batches_location)

        # CHECK new_batches FOR finalMARC FOLDERS
        # (build fresh lists rather than removing items while iterating,
        # which would skip elements)
        new_title_folders = [f for f in new_batches if 'MARC' in f]
        new_batches = [f for f in new_batches if 'MARC' not in f]

        # ISSUE STARTING NOTIFICATIONS
        slack(
            'Starting DLG Batch Load Process! Found `%s` new batches and `%s` replacement batches available to load.'
            % (len(new_batches), len(replacement_batches)))

        # RUN KEVIN'S RSYNC COMMANDS, WAIT
        slack('RSync of batches is starting')
        start_time = time.time()
        slack('Copying new batches')
        # run through the shell so the trailing * glob is expanded
        subprocess.call(
            'rsync -rav --progress /opt/chronam/data/chronamftp/new_batches/* '
            '/opt/chronam/data/dlg_batches/drop/',
            shell=True)
        slack('Copying replacement batches')
        subprocess.call(
            'rsync -rav --progress /opt/chronam/data/chronamftp/replacement_batches/* '
            '/opt/chronam/data/dlg_batches/drop/',
            shell=True)
        duration = time.time() - start_time
        slack('RSync of new and replacement batches completed in %s seconds' %
              duration)

        # LOAD NEW TITLES IF PRESENT
        if new_title_folders:
            slack('Also found `%s` title MARC files to process.' %
                  len(new_title_folders))
            for nt in new_title_folders:
                for nt_f in os.listdir(os.path.join(new_batches_location, nt)):
                    if nt_f.endswith('.xml'):
                        marc_file = os.path.join(nonlccn_location, nt_f)
                        copyfile(os.path.join(new_batches_location, nt, nt_f),
                                 marc_file)
                        title_load_results = title_loader.load(marc_file)
                        if title_load_results[1]:
                            slack('New title created from `%s`.' % nt_f)
                        if title_load_results[2]:
                            slack('Title updated from `%s`.' % nt_f)
                        if title_load_results[3]:
                            slack('Error on title load from `%s`' % nt_f)
            index_titles(start)
            slack('Finished loading titles.')

        # PURGE REPLACEMENT BATCHES
        if replacement_batches:
            slack('Purging batches destined for replacement.')
            for r_b in replacement_batches:
                # decrement the verNN suffix to get the name of the
                # currently loaded batch that this one replaces
                batch_to_purge = r_b.replace('ver02','ver01')\
                    .replace('ver03','ver02')\
                    .replace('ver04','ver03')\
                    .replace('ver05','ver04')\
                    .replace('ver06','ver05')\
                    .replace('ver07','ver06')\
                    .replace('ver08','ver07')
                slack('Purging `%s`.' % batch_to_purge)
                loader.purge_batch(batch_to_purge)
            start_time = time.time()
            solr = SolrConnection(settings.SOLR)
            solr.optimize()
            # parenthesize the subtraction: '%' binds tighter than '-'
            slack('Index optimize complete in `%s` seconds.' %
                  (time.time() - start_time))

        # LOAD ALL BATCHES
        # start with replacement batches
        final_loader = batch_loader.BatchLoader(process_ocr=True,
                                                process_coordinates=True)
        if replacement_batches:
            replace_start = time.time()
            for replacement in replacement_batches:
                final_loader.load_batch('drop/%s' % replacement, strict=False)
                slack('Loaded replacement batch `%s`.' % replacement)
            slack('All replacement batches loaded in `%s` seconds.' %
                  (time.time() - replace_start))
        # load new batches
        if new_batches:
            new_start = time.time()
            for new in new_batches:
                final_loader.load_batch('drop/%s' % new, strict=False)
                slack('Loaded new batch `%s`.' % new)
            slack('All new batches loaded in `%s` seconds.' %
                  (time.time() - new_start))

        slack('Batch loading job complete!')
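As published, the two rsync invocations in this example passed a single whitespace-joined string inside a list, which subprocess.call treats as the name of one (nonexistent) program; the corrected calls above keep the string form with shell=True so the * glob still expands. A sketch of the shell-free alternative, expanding the glob in Python and passing a plain argument list (which sidesteps shell quoting entirely):

    import glob
    import subprocess

    # Expand the source glob ourselves, then hand rsync an argument list.
    sources = glob.glob('/opt/chronam/data/chronamftp/new_batches/*')
    subprocess.call(['rsync', '-rav', '--progress'] + sources +
                    ['/opt/chronam/data/dlg_batches/drop/'])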
Example #5
class Command(BaseCommand):

    skip_essays = make_option('--skip-essays',
                              action='store_true',
                              dest='skip_essays',
                              default=False,
                              help='Skip essay loading.')

    pull_title_updates = make_option('--pull-title-updates',
                                     action='store_true',
                                     dest='pull_title_updates',
                                     default=False,
                                     help='Pull down a new set of titles.')

    option_list = BaseCommand.option_list + (skip_essays, pull_title_updates)

    help = 'Runs title pull and title load for a complete title refresh.'
    args = ''

    def find_titles_not_updated(self, limited=True):
        _logger.info("Looking for titles not yet updated.")

        if Title.objects.count() == 0:
            _logger.info("Total number of titles not updated: 0")
            return Title.objects.values()
        elif limited:
            titles = Title.objects.order_by('-version').values(
                'lccn_orig', 'oclc', 'version')
            end = titles[0]['version']
        else:
            titles = Title.objects.order_by('-version')
            end = titles[0].version

        start = end - timedelta(weeks=2)
        titles = titles.exclude(version__range=(start, end))

        _logger.info("Total number of titles not updated: %s" % len(titles))
        return titles

    def pull_lccn_updates(self, titles):
        start = datetime.now()
        for t in titles:
            call_command('pull_titles', lccn=t['lccn_orig'], oclc=t['oclc'])
        end = datetime.now()
        total_time = end - start
        _logger.info('total time for pull_lccn_updates: %s' % total_time)
        return

    def handle(self, *args, **options):
        start = datetime.now()

        _logger.info("Starting title sync process.")
        # only load titles if the BIB_STORAGE is there, not always the case
        # for folks in the opensource world
        bib_in_settings = validate_bib_dir()
        if bib_in_settings:
            worldcat_dir = bib_in_settings + '/worldcat_titles/'

            pull_titles = bool(options['pull_title_updates']
                               and hasattr(settings, "WORLDCAT_KEY"))
            if pull_titles:
                call_command('pull_titles')

            _logger.info("Starting load of OCLC titles.")
            bulk_dir = worldcat_dir + 'bulk'
            if os.path.isdir(bulk_dir):
                call_command('load_titles', bulk_dir, skip_index=True)

            tnu = self.find_titles_not_updated()

            # Only update by individual lccn if there are records that need updating.
            if pull_titles and len(tnu):
                _logger.info(
                    "Pulling titles from OCLC by individual lccn & oclc num.")
                self.pull_lccn_updates(tnu)

            _logger.info("Loading titles from second title pull.")
            lccn_dir = worldcat_dir + 'lccn'
            if os.path.isdir(lccn_dir):
                call_command('load_titles', lccn_dir, skip_index=True)

            tnu = self.find_titles_not_updated(limited=False)
            _logger.info("Running pre-deletion checks for these titles.")

        if bib_in_settings:
            if len(tnu):
                # Delete titles that haven't been updated and have no issues attached.
                for title in tnu:
                    issues = title.issues.all()

                    error = "DELETION ERROR: Title %s has " % title
                    error_end = "It will not be deleted."

                    if issues:
                        _logger.warning(error + 'issues.' + error_end)
                        continue

            # Load holdings for all remaining titles.
            call_command('load_holdings')

        # overlay place info harvested from dbpedia onto the places table
        try:
            self.load_place_links()
        except Exception as e:
            _logger.exception(e)

        index.index_titles()

        # Time of full process run
        end = datetime.now()
        total_time = end - start
        _logger.info('start time: %s' % start)
        _logger.info('end time: %s' % end)
        _logger.info('total time: %s' % total_time)
        _logger.info("title_sync done.")
Example #6
    def handle(self, *args, **options):
        start = datetime.now()

        LOGGER.info("Starting title sync process.")
        # only load titles if the BIB_STORAGE is there, not always the case
        # for folks in the opensource world
        bib_in_settings = validate_bib_dir()
        if bib_in_settings:
            worldcat_dir = bib_in_settings + '/worldcat_titles/'

            pull_titles = bool(options['pull_title_updates']
                               and hasattr(settings, "WORLDCAT_KEY"))
            if pull_titles:
                call_command('pull_titles')

            LOGGER.info("Starting load of OCLC titles.")
            bulk_dir = worldcat_dir + 'bulk'
            if os.path.isdir(bulk_dir):
                call_command('load_titles', bulk_dir, skip_index=True)

            tnu = self.find_titles_not_updated()

            # Only update by individual lccn if there are records that need updating.
            if pull_titles and len(tnu):
                LOGGER.info(
                    "Pulling titles from OCLC by individual lccn & oclc num.")
                self.pull_lccn_updates(tnu)

            LOGGER.info("Loading titles from second title pull.")
            lccn_dir = worldcat_dir + 'lccn'
            if os.path.isdir(lccn_dir):
                call_command('load_titles', lccn_dir, skip_index=True)

            tnu = self.find_titles_not_updated(limited=False)
            LOGGER.info("Running pre-deletion checks for these titles.")

        # Make sure that our essays are up to date
        if not options['skip_essays']:
            load_essays(settings.ESSAYS_FEED)

        if bib_in_settings:
            if len(tnu):
                # Delete titles that haven't been updated and do not have essays or issues attached.
                for title in tnu:
                    essays = title.essays.all()
                    issues = title.issues.all()

                    error = "DELETION ERROR: Title %s has " % title
                    error_end = "It will not be deleted."

                    # delete only when the title has neither essays nor issues
                    if not essays and not issues:
                        delete_txt = (title.name, title.lccn, title.oclc)
                        LOGGER.info('TITLE DELETED: %s, lccn: %s, oclc: %s' %
                                    delete_txt)
                        title.delete()
                    elif essays:
                        LOGGER.warning(error + 'essays.' + error_end)
                        continue
                    elif issues:
                        LOGGER.warning(error + 'issues.' + error_end)
                        continue

            # Load holdings for all remaining titles.
            call_command('load_holdings')

        # overlay place info harvested from dbpedia onto the places table
        try:
            self.load_place_links()
        except Exception as e:
            LOGGER.exception(e)

        index.index_titles()

        # Time of full process run
        end = datetime.now()
        total_time = end - start
        LOGGER.info('start time: %s' % start)
        LOGGER.info('end time: %s' % end)
        LOGGER.info('total time: %s' % total_time)
        LOGGER.info("title_sync done.")