def __init__(self, function):
    """
    Prepare a harvester for a single function.

    Records the function name, zeroes the paging counters, creates the
    series and item search clients, and binds the ``functions`` and
    ``series`` collections of the database returned by ``self.get_db()``.
    A unique index on ``identifier`` is requested for the ``series``
    collection.
    """
    # Paging state, filled in as a harvest progresses.
    self.pages_complete = 0
    self.total_pages = 0
    self.function = function

    # One client lists series, the other counts items within a series.
    self.series_client = RSSeriesSearchClient()
    self.search_client = RSSearchClient()

    database = self.get_db()
    self.functions = database.functions
    series_collection = database.series
    self.series = series_collection
    # A unique index makes re-inserting a known series raise instead of
    # creating a duplicate document.
    series_collection.create_index('identifier', unique=True)
class SeriesHarvester():
    """
    Works through agencies associated with a harvested function.
    Grabs series in the function date range and gets some details.
    """

    def __init__(self, function):
        """
        Set up search clients and database collections for `function`.

        `function` is the value matched against the ``function`` field of
        documents in the ``functions`` collection.
        """
        self.function = function
        self.total_pages = 0
        self.pages_complete = 0
        self.series_client = RSSeriesSearchClient()
        self.search_client = RSSearchClient()
        db = self.get_db()
        self.functions = db.functions
        self.series = db.series
        # Unique index so that re-harvesting a series raises
        # DuplicateKeyError rather than storing it twice.
        self.series.create_index('identifier', unique=True)

    def get_db(self):
        """Connect using MONGOLAB_URL and return its default database."""
        dbclient = MongoClient(MONGOLAB_URL)
        return dbclient.get_default_database()

    def start_harvest(self, agency_id='CA 51'):
        """
        Harvest series details for agencies of the current function.

        Only agencies whose ``agency_id`` equals `agency_id` are harvested;
        pass ``None`` to harvest every agency. The default keeps the
        previously hard-coded 'CA 51' filter. Agencies that are not
        harvested are written back unchanged.

        Raises ValueError if no document exists for the function.
        """
        # Get agencies in function
        function = self.functions.find_one({'function': self.function})
        if function is None:
            raise ValueError(
                'No harvested function named {!r}'.format(self.function))
        agencies = []
        for agency in function['agencies']:
            if agency_id is None or agency['agency_id'] == agency_id:
                agency['series'] = self._harvest_agency_series(agency)
            # NOTE(review): the original indentation was lost; appending
            # outside the filter keeps unharvested agencies in the stored
            # document instead of dropping them -- confirm this is intended.
            agencies.append(agency)
        self.functions.update_one(
            {'function': self.function},
            {'$set': {'agencies': agencies}})

    def _function_years(self, agency):
        """Return (start, end) years, as strings, for an agency's function."""
        start_date = agency['function_start']['date']
        end_date = agency['function_end']['date']
        # A missing start date falls back to 1900.
        function_start = str(start_date.year) if start_date else '1900'
        # A missing end date falls back to the current year. Converted to
        # str so both branches hand the same type to the search clients.
        if end_date:
            function_end = str(end_date.year)
        else:
            function_end = str(datetime.datetime.now().year)
        return function_start, function_end

    def _count_items(self, series_id, function_start, function_end, **extra):
        """Count items in a series for the period; 20000 on TooManyError."""
        try:
            self.search_client.search(
                digitised=False, series=series_id,
                date_from=function_start, date_to=function_end, **extra)
            return int(self.search_client.total_results)
        except TooManyError:
            return 20000

    def _summarise_series(self, result, function_start, function_end):
        """Normalise one series search result and build its summary dict."""
        # Normalise in place: `result` is also what gets stored later.
        if result['items_digitised'] == '20000+':
            result['items_digitised'] = 20000
        elif result['items_digitised'] is None:
            result['items_digitised'] = 0
        series = {
            'series_id': result['identifier'],
            'title': result['title'],
            'items_described': int(
                result['items_described']['described_number']),
            'items_digitised': result['items_digitised']
        }
        series['items_described_in_period'] = self._count_items(
            result['identifier'], function_start, function_end)
        series['items_digitised_in_period'] = self._count_items(
            result['identifier'], function_start, function_end,
            digital=['on'])
        return series

    def _harvest_agency_series(self, agency):
        """Page through an agency's series and return their summaries."""
        self.pages_complete = 0
        page = 1
        function_start, function_end = self._function_years(agency)
        series_list = []
        self.prepare_harvest(agency, function_start, function_end)
        while self.pages_complete < self.total_pages:
            response = self.series_client.search_series(
                page=page, agency_recording=agency['agency_id'],
                date_from=function_start, date_to=function_end)
            for result in response['results']:
                series = self._summarise_series(
                    result, function_start, function_end)
                pp.pprint(series)
                series_list.append(series)
                try:
                    self.series.insert_one(result)
                except DuplicateKeyError:
                    # Already stored by an earlier harvest; keep going.
                    pass
            self.pages_complete += 1
            page += 1
            # Pause between page requests.
            time.sleep(1)
        return series_list

    def get_total(self):
        """Return the total reported by the most recent series search."""
        return self.series_client.total_results

    def prepare_harvest(self, agency, function_start, function_end):
        """
        Reset paging state and work out how many result pages to fetch.

        Runs a series search for the agency and date range, then sets
        ``self.total_pages`` from the reported total. When no total is
        reported the page count is reset to 0 so that a previous agency's
        value is not reused.
        """
        self.pages_complete = 0
        print(agency['agency_id'])
        self.series_client.search_series(
            results_per_page=0, agency_recording=agency['agency_id'],
            date_from=function_start, date_to=function_end)
        total_results = self.series_client.total_results
        if total_results is not None:
            print('{} series'.format(total_results))
            per_page = self.series_client.results_per_page
            # Ceiling division: an exact multiple needs no extra page.
            self.total_pages = (int(total_results) + per_page - 1) // per_page
            print('{} pages'.format(self.total_pages))
        else:
            self.total_pages = 0
            print('No series')
class SeriesHarvester():
    """
    Works through agencies associated with a harvested function.
    Grabs series in the function date range and gets some details.
    """

    def __init__(self, function):
        """
        Set up search clients and database collections for `function`.

        `function` is the value matched against the ``function`` field of
        documents in the ``functions`` collection.
        """
        self.function = function
        self.total_pages = 0
        self.pages_complete = 0
        self.series_client = RSSeriesSearchClient()
        self.search_client = RSSearchClient()
        db = self.get_db()
        self.functions = db.functions
        self.series = db.series
        # Unique index so that re-harvesting a series raises
        # DuplicateKeyError rather than storing it twice.
        self.series.create_index('identifier', unique=True)

    def get_db(self):
        """Connect using MONGOLAB_URL and return its default database."""
        dbclient = MongoClient(MONGOLAB_URL)
        return dbclient.get_default_database()

    def start_harvest(self, agency_id='CA 51'):
        """
        Harvest series details for agencies of the current function.

        Only agencies whose ``agency_id`` equals `agency_id` are harvested;
        pass ``None`` to harvest every agency. The default keeps the
        previously hard-coded 'CA 51' filter. Agencies that are not
        harvested are written back unchanged.

        Raises ValueError if no document exists for the function.
        """
        # Get agencies in function
        function = self.functions.find_one({'function': self.function})
        if function is None:
            raise ValueError(
                'No harvested function named {!r}'.format(self.function))
        agencies = []
        for agency in function['agencies']:
            if agency_id is None or agency['agency_id'] == agency_id:
                agency['series'] = self._harvest_agency_series(agency)
            # NOTE(review): the original indentation was lost; appending
            # outside the filter keeps unharvested agencies in the stored
            # document instead of dropping them -- confirm this is intended.
            agencies.append(agency)
        self.functions.update_one(
            {'function': self.function},
            {'$set': {'agencies': agencies}})

    def _function_years(self, agency):
        """Return (start, end) years, as strings, for an agency's function."""
        start_date = agency['function_start']['date']
        end_date = agency['function_end']['date']
        # A missing start date falls back to 1900.
        function_start = str(start_date.year) if start_date else '1900'
        # A missing end date falls back to the current year. Converted to
        # str so both branches hand the same type to the search clients.
        if end_date:
            function_end = str(end_date.year)
        else:
            function_end = str(datetime.datetime.now().year)
        return function_start, function_end

    def _count_items(self, series_id, function_start, function_end, **extra):
        """Count items in a series for the period; 20000 on TooManyError."""
        try:
            self.search_client.search(
                digitised=False, series=series_id,
                date_from=function_start, date_to=function_end, **extra)
            return int(self.search_client.total_results)
        except TooManyError:
            return 20000

    def _summarise_series(self, result, function_start, function_end):
        """Normalise one series search result and build its summary dict."""
        # Normalise in place: `result` is also what gets stored later.
        if result['items_digitised'] == '20000+':
            result['items_digitised'] = 20000
        elif result['items_digitised'] is None:
            result['items_digitised'] = 0
        series = {
            'series_id': result['identifier'],
            'title': result['title'],
            'items_described': int(
                result['items_described']['described_number']),
            'items_digitised': result['items_digitised']
        }
        series['items_described_in_period'] = self._count_items(
            result['identifier'], function_start, function_end)
        series['items_digitised_in_period'] = self._count_items(
            result['identifier'], function_start, function_end,
            digital=['on'])
        return series

    def _harvest_agency_series(self, agency):
        """Page through an agency's series and return their summaries."""
        self.pages_complete = 0
        page = 1
        function_start, function_end = self._function_years(agency)
        series_list = []
        self.prepare_harvest(agency, function_start, function_end)
        while self.pages_complete < self.total_pages:
            response = self.series_client.search_series(
                page=page, agency_recording=agency['agency_id'],
                date_from=function_start, date_to=function_end)
            for result in response['results']:
                series = self._summarise_series(
                    result, function_start, function_end)
                pp.pprint(series)
                series_list.append(series)
                try:
                    self.series.insert_one(result)
                except DuplicateKeyError:
                    # Already stored by an earlier harvest; keep going.
                    pass
            self.pages_complete += 1
            page += 1
            # Pause between page requests.
            time.sleep(1)
        return series_list

    def get_total(self):
        """Return the total reported by the most recent series search."""
        return self.series_client.total_results

    def prepare_harvest(self, agency, function_start, function_end):
        """
        Reset paging state and work out how many result pages to fetch.

        Runs a series search for the agency and date range, then sets
        ``self.total_pages`` from the reported total. When no total is
        reported the page count is reset to 0 so that a previous agency's
        value is not reused.
        """
        self.pages_complete = 0
        print(agency['agency_id'])
        self.series_client.search_series(
            results_per_page=0, agency_recording=agency['agency_id'],
            date_from=function_start, date_to=function_end)
        total_results = self.series_client.total_results
        if total_results is not None:
            print('{} series'.format(total_results))
            per_page = self.series_client.results_per_page
            # Ceiling division: an exact multiple needs no extra page.
            self.total_pages = (int(total_results) + per_page - 1) // per_page
            print('{} pages'.format(self.total_pages))
        else:
            self.total_pages = 0
            print('No series')