def __init__(self, function):
    """
    Set up a series harvest for one function.

    function -- value matched against the 'function' field of the
                documents in the functions collection.

    Creates the two RecordSearch clients, opens the database and makes
    sure the series collection has a unique index on 'identifier'.
    """
    self.function = function
    # Paging state; both counters start at zero.
    self.total_pages = self.pages_complete = 0
    self.series_client = RSSeriesSearchClient()
    self.search_client = RSSearchClient()
    database = self.get_db()
    self.functions, self.series = database.functions, database.series
    self.series.create_index('identifier', unique=True)
def __init__(self, harvest, **kwargs):
    """
    Set up a harvest of 'Closed' items.

    harvest -- harvest date string, handed on to set_harvest().
    kwargs  -- accepted but not used by this constructor.

    The 'Closed' search is run straight away (via prepare_harvest) so
    the page count is available before the database is opened.
    """
    # Paging state; prepare_harvest() is called below with the search.
    self.total_pages, self.pages_complete = None, 0
    self.client = RSSearchClient()
    self.prepare_harvest(access='Closed')
    database = self.get_db()
    (self.items,
     self.series,
     self.harvests) = (database.items, database.series, database.harvests)
    self.set_harvest(harvest)
class SearchHarvester():
    """
    Harvest the details of 'Closed' files from RecordSearch.
    Saves to MongoDB.

    harvester = SearchHarvester(harvest='2015-12-31')
    harvester.start_harvest()
    """

    def __init__(self, harvest, **kwargs):
        """
        Run the initial 'Closed' search, open the database and register
        the harvest date.

        harvest -- harvest date as a 'YYYY-MM-DD' string.
        kwargs  -- accepted but not used.
        """
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSSearchClient()
        self.prepare_harvest(access='Closed')
        db = self.get_db()
        self.items = db.items
        self.series = db.series
        self.harvests = db.harvests
        self.set_harvest(harvest)

    def set_harvest(self, harvest):
        """
        Store the harvest date in the harvests collection and keep it
        on the instance as self.harvest_date (a datetime at midnight).

        harvest -- date string in the form 'YYYY-MM-DD'.
        """
        dates = harvest.split('-')
        harvest_date = datetime(int(dates[0]), int(dates[1]), int(dates[2]), 0, 0, 0)
        try:
            # insert_one matches the call used for items below; the
            # legacy Collection.insert is deprecated in pymongo 3.
            self.harvests.insert_one({'harvest_date': harvest_date})
        except DuplicateKeyError:
            # Only possible when a unique index on 'harvest_date' exists
            # (e.g. create_index('harvest_date', unique=True)); the
            # harvest is then already registered, which is fine.
            pass
        self.harvest_date = harvest_date

    def get_db(self):
        """Return the default database named in MONGOLAB_URL."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        return db

    def get_total(self):
        """Return the total number of results reported by the last search."""
        return self.client.total_results

    def prepare_harvest(self, **kwargs):
        """
        Run a search with the given parameters and work out how many
        result pages there are to harvest (stored in self.total_pages).
        """
        self.client.search(**kwargs)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        per_page = self.client.results_per_page
        # Ceiling division: the previous `total / per_page + 1` asked for
        # one page too many whenever the total was an exact multiple of
        # the page size.
        self.total_pages = (int(total_results) + per_page - 1) // per_page
        print('{} pages'.format(self.total_pages))

    def start_harvest(self, page=None):
        """
        Work through the result pages of the 'Closed' search, saving
        every item not yet recorded for this harvest date.

        page -- 1-based page to start (or resume) from. When omitted the
                harvest continues after the last completed page.
        """
        item_client = RSItemClient()
        series_client = RSSeriesClient()
        # Earlier versions refreshed everything with each harvest:
        # self.series.remove({})
        # self.items.remove({})
        if not page:
            page = self.pages_complete + 1
        else:
            self.pages_complete = page - 1
        while self.pages_complete < self.total_pages:
            response = self.client.search(access='Closed', page=page, sort='9')
            for result in response['results']:
                exists = self.items.find_one(
                    {'_id': result['identifier'], 'harvests': self.harvest_date})
                if not exists:
                    self._harvest_item(result['identifier'], item_client, series_client)
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            # Pause between pages of requests.
            time.sleep(1)

    def _harvest_item(self, identifier, item_client, series_client):
        """Fetch one item, enrich it and save it to the items collection."""
        item = item_client.get_summary(entity_id=identifier)
        item['_id'] = item['identifier']
        # presumably used for random sampling queries -- TODO confirm
        item['random_id'] = [random.random(), 0]
        item['reasons'] = self._normalise_reasons(item['access_reason'])
        self._add_series_details(item, series_client)
        item['harvests'] = [self.harvest_date]
        # NOTE(review): an item stored under an earlier harvest date has
        # the same _id, so this insert would raise DuplicateKeyError for
        # it -- confirm whether the collection is meant to be emptied
        # between harvests.
        self.items.insert_one(item)
        print(item['identifier'])

    def _normalise_reasons(self, access_reasons):
        """Map raw access reasons to the labels in EXCEPTIONS where a pattern matches."""
        reasons = []
        for reason in access_reasons:
            matched = False
            # No early exit: every matching pattern contributes its label.
            for exception, pattern in EXCEPTIONS:
                if re.match(pattern, reason['reason']):
                    reasons.append(exception)
                    matched = True
            if not matched:
                # Unrecognised reasons are kept verbatim.
                reasons.append(reason['reason'])
        return reasons

    def _add_series_details(self, item, series_client):
        """Add the series title and agencies without an end date to the item."""
        print(item['series'])
        series = self.series.find_one({'identifier': item['series']})
        if not series:
            # Not cached yet: fetch it from RecordSearch and store it.
            try:
                series = series_client.get_summary(
                    entity_id=item['series'], include_access_status=False)
                self.series.insert_one(series)
            except UsageError:
                series = None
        if series:
            item['series_title'] = series['title']
            if series['controlling_agencies']:
                # Keep only agencies with no end date recorded.
                item['agencies'] = [
                    agency for agency in series['controlling_agencies']
                    if not agency['end_date'] or not agency['end_date']['date']
                ]
class SeriesHarvester():
    """
    Works through agencies associated with a harvested function.
    Grabs series in the function date range and gets some details.
    """

    def __init__(self, function):
        """
        function -- value matched against the 'function' field of the
                    documents in the functions collection.
        """
        self.function = function
        self.total_pages = 0
        self.pages_complete = 0
        self.series_client = RSSeriesSearchClient()
        self.search_client = RSSearchClient()
        db = self.get_db()
        self.functions = db.functions
        self.series = db.series
        self.series.create_index('identifier', unique=True)

    def get_db(self):
        """Return the default database named in MONGOLAB_URL."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        return db

    def start_harvest(self):
        """
        Harvest series details for the function's agencies and write the
        enriched agency list back to the function document.
        """
        # Get agencies in function
        function = self.functions.find_one({'function': self.function})
        agencies = []
        for agency in function['agencies']:
            # NOTE(review): only agency 'CA 51' is harvested, and the
            # update below replaces the stored agency list with the
            # harvested agencies only -- confirm this filter is intended.
            if agency['agency_id'] != 'CA 51':
                continue
            function_start, function_end = self._function_period(agency)
            series_list = []
            page = 1
            # Resets pages_complete and sets total_pages for this agency.
            self.prepare_harvest(agency, function_start, function_end)
            while self.pages_complete < self.total_pages:
                response = self.series_client.search_series(
                    page=page,
                    agency_recording=agency['agency_id'],
                    date_from=function_start,
                    date_to=function_end)
                for result in response['results']:
                    series = self._summarise_series(result, function_start, function_end)
                    pp.pprint(series)
                    series_list.append(series)
                    try:
                        self.series.insert_one(result)
                    except DuplicateKeyError:
                        # Series already stored (unique 'identifier' index).
                        pass
                self.pages_complete += 1
                page += 1
                # Pause between pages of requests.
                time.sleep(1)
            agency['series'] = series_list
            agencies.append(agency)
        self.functions.update_one(
            {'function': self.function},
            {'$set': {'agencies': agencies}})

    def get_total(self):
        """Return the number of series reported by the last series search."""
        # This class has no `client` attribute; the series count lives on
        # series_client (see prepare_harvest).
        return self.series_client.total_results

    def prepare_harvest(self, agency, function_start, function_end):
        """
        Count the series recorded by the agency within the period and
        work out how many result pages there are (self.total_pages).
        """
        self.pages_complete = 0
        print(agency['agency_id'])
        self.series_client.search_series(
            results_per_page=0,
            agency_recording=agency['agency_id'],
            date_from=function_start,
            date_to=function_end)
        total_results = self.series_client.total_results
        if total_results is not None:
            print('{} series'.format(total_results))
            per_page = self.series_client.results_per_page
            # Ceiling division: `total / per_page + 1` asked for one page
            # too many when the total was an exact multiple of the page size.
            self.total_pages = (int(total_results) + per_page - 1) // per_page
            print('{} pages'.format(self.total_pages))
        else:
            # Do not leave a page count from a previous search in place.
            self.total_pages = 0
            print('No series')

    def _function_period(self, agency):
        """Return (start_year, end_year) as strings for the agency's function dates."""
        # Local import so this works whether the module imports the
        # datetime module or the datetime class.
        import datetime as _datetime
        if not agency['function_start']['date']:
            # No recorded start: fall back to 1900.
            function_start = '1900'
        else:
            function_start = str(agency['function_start']['date'].year)
        if not agency['function_end']['date']:
            # No recorded end: use the current year, as a string like the
            # other branch.
            function_end = str(_datetime.datetime.now().year)
        else:
            function_end = str(agency['function_end']['date'].year)
        return function_start, function_end

    def _count_items(self, series_id, function_start, function_end, **extra):
        """Return the item count for a series in the period, capped at 20000."""
        try:
            self.search_client.search(
                digitised=False,
                series=series_id,
                date_from=function_start,
                date_to=function_end,
                **extra)
            return int(self.search_client.total_results)
        except TooManyError:
            return 20000

    def _summarise_series(self, result, function_start, function_end):
        """Normalise a series search result in place and build its summary dict."""
        if result['items_digitised'] == '20000+':
            result['items_digitised'] = 20000
        elif result['items_digitised'] is None:
            result['items_digitised'] = 0
        series = {
            'series_id': result['identifier'],
            'title': result['title'],
            'items_described': int(result['items_described']['described_number']),
            'items_digitised': result['items_digitised']
        }
        series['items_described_in_period'] = self._count_items(
            result['identifier'], function_start, function_end)
        series['items_digitised_in_period'] = self._count_items(
            result['identifier'], function_start, function_end, digital=['on'])
        return series
class SeriesHarvester():
    """
    Works through agencies associated with a harvested function.
    Grabs series in the function date range and gets some details.
    """

    def __init__(self, function):
        """
        function -- value matched against the 'function' field of the
                    documents in the functions collection.
        """
        self.function = function
        self.total_pages = 0
        self.pages_complete = 0
        self.series_client = RSSeriesSearchClient()
        self.search_client = RSSearchClient()
        db = self.get_db()
        self.functions = db.functions
        self.series = db.series
        self.series.create_index('identifier', unique=True)

    def get_db(self):
        """Return the default database named in MONGOLAB_URL."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        return db

    def start_harvest(self):
        """
        Harvest series details for the function's agencies and write the
        enriched agency list back to the function document.
        """
        # Get agencies in function
        function = self.functions.find_one({'function': self.function})
        agencies = []
        for agency in function['agencies']:
            # NOTE(review): only agency 'CA 51' is harvested, and the
            # update below replaces the stored agency list with the
            # harvested agencies only -- confirm this filter is intended.
            if agency['agency_id'] != 'CA 51':
                continue
            function_start, function_end = self._function_period(agency)
            series_list = []
            page = 1
            # Resets pages_complete and sets total_pages for this agency.
            self.prepare_harvest(agency, function_start, function_end)
            while self.pages_complete < self.total_pages:
                response = self.series_client.search_series(
                    page=page,
                    agency_recording=agency['agency_id'],
                    date_from=function_start,
                    date_to=function_end)
                for result in response['results']:
                    series = self._summarise_series(result, function_start, function_end)
                    pp.pprint(series)
                    series_list.append(series)
                    try:
                        self.series.insert_one(result)
                    except DuplicateKeyError:
                        # Series already stored (unique 'identifier' index).
                        pass
                self.pages_complete += 1
                page += 1
                # Pause between pages of requests.
                time.sleep(1)
            agency['series'] = series_list
            agencies.append(agency)
        self.functions.update_one(
            {'function': self.function},
            {'$set': {'agencies': agencies}})

    def get_total(self):
        """Return the number of series reported by the last series search."""
        # This class has no `client` attribute; the series count lives on
        # series_client (see prepare_harvest).
        return self.series_client.total_results

    def prepare_harvest(self, agency, function_start, function_end):
        """
        Count the series recorded by the agency within the period and
        work out how many result pages there are (self.total_pages).
        """
        self.pages_complete = 0
        print(agency['agency_id'])
        self.series_client.search_series(
            results_per_page=0,
            agency_recording=agency['agency_id'],
            date_from=function_start,
            date_to=function_end)
        total_results = self.series_client.total_results
        if total_results is not None:
            print('{} series'.format(total_results))
            per_page = self.series_client.results_per_page
            # Ceiling division: `total / per_page + 1` asked for one page
            # too many when the total was an exact multiple of the page size.
            self.total_pages = (int(total_results) + per_page - 1) // per_page
            print('{} pages'.format(self.total_pages))
        else:
            # Do not leave a page count from a previous search in place.
            self.total_pages = 0
            print('No series')

    def _function_period(self, agency):
        """Return (start_year, end_year) as strings for the agency's function dates."""
        # Local import so this works whether the module imports the
        # datetime module or the datetime class.
        import datetime as _datetime
        if not agency['function_start']['date']:
            # No recorded start: fall back to 1900.
            function_start = '1900'
        else:
            function_start = str(agency['function_start']['date'].year)
        if not agency['function_end']['date']:
            # No recorded end: use the current year, as a string like the
            # other branch.
            function_end = str(_datetime.datetime.now().year)
        else:
            function_end = str(agency['function_end']['date'].year)
        return function_start, function_end

    def _count_items(self, series_id, function_start, function_end, **extra):
        """Return the item count for a series in the period, capped at 20000."""
        try:
            self.search_client.search(
                digitised=False,
                series=series_id,
                date_from=function_start,
                date_to=function_end,
                **extra)
            return int(self.search_client.total_results)
        except TooManyError:
            return 20000

    def _summarise_series(self, result, function_start, function_end):
        """Normalise a series search result in place and build its summary dict."""
        if result['items_digitised'] == '20000+':
            result['items_digitised'] = 20000
        elif result['items_digitised'] is None:
            result['items_digitised'] = 0
        series = {
            'series_id': result['identifier'],
            'title': result['title'],
            'items_described': int(result['items_described']['described_number']),
            'items_digitised': result['items_digitised']
        }
        series['items_described_in_period'] = self._count_items(
            result['identifier'], function_start, function_end)
        series['items_digitised_in_period'] = self._count_items(
            result['identifier'], function_start, function_end, digital=['on'])
        return series