Exemplo n.º 1
0
 def __init__(self, function):
     """
     Prepare a harvester for a single function.

     Stores ``function``, zeroes both page counters, instantiates the
     series-search and item-search clients, and binds the ``functions``
     and ``series`` collections of the database returned by ``get_db()``.
     A unique index on ``identifier`` is requested on the series
     collection.
     """
     self.function = function
     # Same assignment order as before: total_pages first, then pages_complete.
     self.total_pages = self.pages_complete = 0
     # The right-hand side is evaluated left to right, so the series-search
     # client is still constructed before the item-search client.
     self.series_client, self.search_client = (
         RSSeriesSearchClient(),
         RSSearchClient(),
     )
     database = self.get_db()
     self.functions, self.series = database.functions, database.series
     self.series.create_index('identifier', unique=True)
Exemplo n.º 2
0
 def __init__(self, harvest, **kwargs):
     """
     Prepare a harvest of 'Closed' items for the given harvest date.

     The search client is created and ``prepare_harvest(access='Closed')``
     is run before the database is opened; afterwards the ``items``,
     ``series`` and ``harvests`` collections are bound and ``harvest`` is
     handed to ``set_harvest()``. Extra keyword arguments are accepted but
     not used here.
     """
     self.total_pages, self.pages_complete = None, 0
     self.client = RSSearchClient()
     self.prepare_harvest(access='Closed')
     database = self.get_db()
     self.items = database.items
     self.series = database.series
     self.harvests = database.harvests
     self.set_harvest(harvest)
 def __init__(self, function):
     """
     Set up state, clients and collections for harvesting one function.

     Keeps ``function`` on the instance, resets ``total_pages`` and
     ``pages_complete`` to 0, creates the two search clients, and takes
     the ``functions`` and ``series`` collections from ``get_db()``.
     Finally asks for a unique index on the series ``identifier`` field.
     """
     self.function = function
     self.total_pages = self.pages_complete = 0
     series_search = RSSeriesSearchClient()
     item_search = RSSearchClient()
     self.series_client = series_search
     self.search_client = item_search
     mongo_db = self.get_db()
     self.functions = mongo_db.functions
     self.series = mongo_db.series
     self.series.create_index('identifier', unique=True)
Exemplo n.º 4
0
class SearchHarvester():
    """
    Harvest the details of 'Closed' files from RecordSearch.
    Saves to MongoDB.

    Usage:
        harvester = SearchHarvester(harvest='2015-12-31')
        harvester.start_harvest()
    """

    def __init__(self, harvest, **kwargs):
        """
        Size the harvest, connect to MongoDB and register the harvest date.

        harvest -- harvest date as a 'YYYY-MM-DD' string.
        kwargs -- accepted but not used by this class.
        """
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSSearchClient()
        self.prepare_harvest(access='Closed')
        db = self.get_db()
        self.items = db.items
        self.series = db.series
        self.harvests = db.harvests
        self.set_harvest(harvest)

    def set_harvest(self, harvest):
        """
        Parse a 'YYYY-MM-DD' string, record it in the harvests collection
        and keep it as self.harvest_date (a datetime at midnight).
        """
        dates = harvest.split('-')
        harvest_date = datetime(int(dates[0]), int(dates[1]), int(dates[2]))
        # Upsert instead of insert: repeating a harvest date must not add a
        # second record, whether or not a unique index exists on the field.
        self.harvests.update_one(
            {'harvest_date': harvest_date},
            {'$setOnInsert': {'harvest_date': harvest_date}},
            upsert=True)
        self.harvest_date = harvest_date

    def get_db(self):
        """Return the default database of the MONGOLAB_URL connection."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        return db

    def get_total(self):
        """Return the total result count reported by the search client."""
        return self.client.total_results

    def prepare_harvest(self, **kwargs):
        """
        Run a search with the given keyword arguments and work out how
        many result pages there are (stored in self.total_pages).
        """
        self.client.search(**kwargs)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        per_page = self.client.results_per_page
        # Ceiling division: an exact multiple of the page size must not add
        # an extra, empty page. '//' keeps the result an int on Python 3.
        self.total_pages = (int(total_results) + per_page - 1) // per_page
        print('{} pages'.format(self.total_pages))

    def start_harvest(self, page=None):
        """
        Work through the result pages, saving every item not yet recorded
        for the current harvest date.

        page -- 1-based page to start from; when omitted, harvesting
                continues from the page after self.pages_complete.
        """
        item_client = RSItemClient()
        series_client = RSSeriesClient()
        if not page:
            page = self.pages_complete + 1
        else:
            self.pages_complete = page - 1
        while self.pages_complete < self.total_pages:
            response = self.client.search(access='Closed', page=page, sort='9')
            for result in response['results']:
                identifier = result['identifier']
                already_done = self.items.find_one(
                    {'_id': identifier, 'harvests': self.harvest_date})
                if already_done:
                    continue
                item = self._build_item(identifier, item_client, series_client)
                self._save_item(item)
                print(item['identifier'])
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            # Pause between page requests.
            time.sleep(1)

    def _build_item(self, identifier, item_client, series_client):
        """Fetch one item summary and add the fields stored with it."""
        item = item_client.get_summary(entity_id=identifier)
        item['_id'] = item['identifier']
        item['random_id'] = [random.random(), 0]
        item['reasons'] = self._normalise_reasons(item['access_reason'])
        # Get series and agency info
        print(item['series'])
        series = self._get_series(item['series'], series_client)
        if series:
            item['series_title'] = series['title']
            # .get(): a freshly fetched series summary may not carry
            # controlling agencies.
            controlling = series.get('controlling_agencies')
            if controlling:
                # Keep only agencies without an end date.
                item['agencies'] = [
                    agency for agency in controlling
                    if not agency['end_date'] or not agency['end_date']['date']
                ]
        item['harvests'] = [self.harvest_date]
        return item

    def _normalise_reasons(self, access_reasons):
        """Map raw access reasons onto EXCEPTIONS names where a pattern matches."""
        reasons = []
        for reason in access_reasons:
            matched = False
            # Every matching pattern contributes its exception name.
            for exception, pattern in EXCEPTIONS:
                if re.match(pattern, reason['reason']):
                    reasons.append(exception)
                    matched = True
            if not matched:
                reasons.append(reason['reason'])
        return reasons

    def _get_series(self, series_id, series_client):
        """Return the stored series, fetching and storing it if missing; None on UsageError."""
        series = self.series.find_one({'identifier': series_id})
        if not series:
            try:
                series = series_client.get_summary(
                    entity_id=series_id, include_access_status=False)
                self.series.insert_one(series)
            except UsageError:
                series = None
        return series

    def _save_item(self, item):
        """Insert the item, or add this harvest date if the item is already stored."""
        try:
            self.items.insert_one(item)
        except DuplicateKeyError:
            # '_id' is the item identifier, so an item stored by an earlier
            # harvest cannot be inserted again; record this harvest date on
            # the existing document instead of crashing.
            self.items.update_one(
                {'_id': item['_id']},
                {'$addToSet': {'harvests': self.harvest_date}})
Exemplo n.º 5
0
class SeriesHarvester():
    """
    Works through agencies associated with a harvested function.
    Grabs series in the function date range and gets some details.
    """

    def __init__(self, function):
        """
        Create the search clients and bind the MongoDB collections.

        function -- value matched against the 'function' field of the
                    functions collection in start_harvest().
        """
        self.function = function
        self.total_pages = 0
        self.pages_complete = 0
        self.series_client = RSSeriesSearchClient()
        self.search_client = RSSearchClient()
        db = self.get_db()
        self.functions = db.functions
        self.series = db.series
        self.series.create_index('identifier', unique=True)

    def get_db(self):
        """Return the default database of the MONGOLAB_URL connection."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        return db

    def start_harvest(self):
        """
        Harvest series for the function's agencies and store them on the
        function document under each agency's 'series' key.
        """
        # Get agencies in function
        function = self.functions.find_one({'function': self.function})
        agencies = []
        for agency in function['agencies']:
            # NOTE(review): only agency 'CA 51' is harvested, and the stored
            # 'agencies' list is replaced by the harvested agencies alone.
            # This looks like a leftover filter; confirm before widening.
            if agency['agency_id'] != 'CA 51':
                continue
            function_start, function_end = self._function_years(agency)
            self.prepare_harvest(agency, function_start, function_end)
            agency['series'] = self._harvest_agency_series(
                agency, function_start, function_end)
            agencies.append(agency)
        # Written once after the loop rather than on every iteration; the
        # final stored value is the same.
        self.functions.update_one({'function': self.function},
                                  {'$set': {
                                      'agencies': agencies
                                  }})

    def _function_years(self, agency):
        """Return (start, end) years as strings; defaults are '1900' and the current year."""
        start = agency['function_start']['date']
        end = agency['function_end']['date']
        function_start = str(start.year) if start else '1900'
        # Always a string, so both branches pass the same type downstream.
        function_end = str(end.year) if end else str(datetime.datetime.now().year)
        return function_start, function_end

    def _harvest_agency_series(self, agency, function_start, function_end):
        """Page through the agency's series and return a list of summary dicts."""
        series_list = []
        page = 1
        while self.pages_complete < self.total_pages:
            response = self.series_client.search_series(
                page=page,
                agency_recording=agency['agency_id'],
                date_from=function_start,
                date_to=function_end)
            for result in response['results']:
                # Normalise the digitised count: '20000+' -> 20000, None -> 0.
                if result['items_digitised'] == '20000+':
                    result['items_digitised'] = 20000
                elif result['items_digitised'] is None:
                    result['items_digitised'] = 0
                series = {
                    'series_id': result['identifier'],
                    'title': result['title'],
                    'items_described':
                    int(result['items_described']['described_number']),
                    'items_digitised': result['items_digitised']
                }
                period = {
                    'digitised': False,
                    'series': result['identifier'],
                    'date_from': function_start,
                    'date_to': function_end
                }
                series['items_described_in_period'] = self._count_items(
                    **period)
                series['items_digitised_in_period'] = self._count_items(
                    digital=['on'], **period)
                pp.pprint(series)
                series_list.append(series)
                try:
                    self.series.insert_one(result)
                except DuplicateKeyError:
                    # The unique 'identifier' index rejects series that are
                    # already stored; that is expected, so skip them.
                    pass
            self.pages_complete += 1
            page += 1
            # Pause between page requests.
            time.sleep(1)
        return series_list

    def _count_items(self, **params):
        """Run an item search and return its total as an int; 20000 on TooManyError."""
        try:
            self.search_client.search(**params)
            return int(self.search_client.total_results)
        except TooManyError:
            return 20000

    def get_total(self):
        """Return the total result count reported by the series search client."""
        # This class has no 'client' attribute; the series search client is
        # the one whose total prepare_harvest() reads.
        return self.series_client.total_results

    def prepare_harvest(self, agency, function_start, function_end):
        """
        Reset the page counter and work out how many pages of series the
        agency has in the given year range (stored in self.total_pages).
        """
        self.pages_complete = 0
        print(agency['agency_id'])
        self.series_client.search_series(results_per_page=0,
                                         agency_recording=agency['agency_id'],
                                         date_from=function_start,
                                         date_to=function_end)
        total_results = self.series_client.total_results
        if total_results is not None:
            print('{} series'.format(total_results))
            per_page = self.series_client.results_per_page
            # Ceiling division: no extra empty page on exact multiples, and
            # the result stays an int on Python 3.
            self.total_pages = (int(total_results) + per_page - 1) // per_page
            print('{} pages'.format(self.total_pages))
        else:
            # Do not carry over the page count of a previous agency.
            self.total_pages = 0
            print('No series')
class SeriesHarvester():
    """
    Works through agencies associated with a harvested function.
    Grabs series in the function date range and gets some details.
    """

    def __init__(self, function):
        """
        Create the search clients and bind the MongoDB collections.

        function -- value matched against the 'function' field of the
                    functions collection in start_harvest().
        """
        self.function = function
        self.total_pages = 0
        self.pages_complete = 0
        self.series_client = RSSeriesSearchClient()
        self.search_client = RSSearchClient()
        db = self.get_db()
        self.functions = db.functions
        self.series = db.series
        self.series.create_index('identifier', unique=True)

    def get_db(self):
        """Return the default database of the MONGOLAB_URL connection."""
        dbclient = MongoClient(MONGOLAB_URL)
        db = dbclient.get_default_database()
        return db

    def start_harvest(self):
        """
        Harvest series for the function's agencies and store them on the
        function document under each agency's 'series' key.
        """
        # Get agencies in function
        function = self.functions.find_one({'function': self.function})
        agencies = []
        for agency in function['agencies']:
            # NOTE(review): only agency 'CA 51' is harvested, and the stored
            # 'agencies' list is replaced by the harvested agencies alone.
            # This looks like a leftover filter; confirm before widening.
            if agency['agency_id'] != 'CA 51':
                continue
            function_start, function_end = self._function_years(agency)
            self.prepare_harvest(agency, function_start, function_end)
            agency['series'] = self._harvest_agency_series(agency, function_start, function_end)
            agencies.append(agency)
        # Written once after the loop rather than on every iteration; the
        # final stored value is the same.
        self.functions.update_one({'function': self.function}, {'$set': {'agencies': agencies}})

    def _function_years(self, agency):
        """Return (start, end) years as strings; defaults are '1900' and the current year."""
        start = agency['function_start']['date']
        end = agency['function_end']['date']
        function_start = str(start.year) if start else '1900'
        # Always a string, so both branches pass the same type downstream.
        function_end = str(end.year) if end else str(datetime.datetime.now().year)
        return function_start, function_end

    def _harvest_agency_series(self, agency, function_start, function_end):
        """Page through the agency's series and return a list of summary dicts."""
        series_list = []
        page = 1
        while self.pages_complete < self.total_pages:
            response = self.series_client.search_series(page=page, agency_recording=agency['agency_id'], date_from=function_start, date_to=function_end)
            for result in response['results']:
                # Normalise the digitised count: '20000+' -> 20000, None -> 0.
                if result['items_digitised'] == '20000+':
                    result['items_digitised'] = 20000
                elif result['items_digitised'] is None:
                    result['items_digitised'] = 0
                series = {
                    'series_id': result['identifier'],
                    'title': result['title'],
                    'items_described': int(result['items_described']['described_number']),
                    'items_digitised': result['items_digitised']
                }
                period = {
                    'digitised': False,
                    'series': result['identifier'],
                    'date_from': function_start,
                    'date_to': function_end
                }
                series['items_described_in_period'] = self._count_items(**period)
                series['items_digitised_in_period'] = self._count_items(digital=['on'], **period)
                pp.pprint(series)
                series_list.append(series)
                try:
                    self.series.insert_one(result)
                except DuplicateKeyError:
                    # The unique 'identifier' index rejects series that are
                    # already stored; that is expected, so skip them.
                    pass
            self.pages_complete += 1
            page += 1
            # Pause between page requests.
            time.sleep(1)
        return series_list

    def _count_items(self, **params):
        """Run an item search and return its total as an int; 20000 on TooManyError."""
        try:
            self.search_client.search(**params)
            return int(self.search_client.total_results)
        except TooManyError:
            return 20000

    def get_total(self):
        """Return the total result count reported by the series search client."""
        # This class has no 'client' attribute; the series search client is
        # the one whose total prepare_harvest() reads.
        return self.series_client.total_results

    def prepare_harvest(self, agency, function_start, function_end):
        """
        Reset the page counter and work out how many pages of series the
        agency has in the given year range (stored in self.total_pages).
        """
        self.pages_complete = 0
        print(agency['agency_id'])
        self.series_client.search_series(results_per_page=0, agency_recording=agency['agency_id'], date_from=function_start, date_to=function_end)
        total_results = self.series_client.total_results
        if total_results is not None:
            print('{} series'.format(total_results))
            per_page = self.series_client.results_per_page
            # Ceiling division: no extra empty page on exact multiples, and
            # the result stays an int on Python 3.
            self.total_pages = (int(total_results) + per_page - 1) // per_page
            print('{} pages'.format(self.total_pages))
        else:
            # Do not carry over the page count of a previous agency.
            self.total_pages = 0
            print('No series')