def _process_row(self, row):
    """Turn one row of a results table into an item dictionary.

    Args:
        row: Parsed table row exposing ``findAll`` (presumably a
            BeautifulSoup tag -- confirm against the caller). Its ``td``
            cells are read by position: 1 = series, 2 = control symbol,
            3 = title plus access status and location, 4 = contents dates,
            5 = digitisation link, 6 = barcode.

    Returns:
        dict with the keys ``series``, ``control_symbol``, ``title``,
        ``access_status``, ``location``, ``contents_dates``,
        ``digitised_status`` and ``identifier``. ``digitised_pages`` is
        added only when ``self.get_digitised`` is truthy.
    """
    cells = row.findAll('td')
    title_cell = cells[3]

    series = cells[1].string.strip()
    control_symbol = cells[2].string.strip()
    title = title_cell.contents[0].string.strip()

    # Access status and location live in two labelled divs inside the
    # title cell; the label prefix is stripped off with a regex.
    access_text = title_cell.find('div', 'CombinedTitleBottomLeft').string
    access_status = re.search(
        r'Access status: ([\w ]+)', access_text).group(1).strip()
    location_text = title_cell.find('div', 'CombinedTitleBottomRight').string
    location = re.search(
        r'Location: ([\w ]+)', location_text).group(1).strip()

    # Keep the raw date string alongside the ISO-formatted start and end.
    date_str = cells[4].string.strip()
    parsed_dates = utilities.process_date_string(date_str)
    contents_dates = {
        'date_str': date_str,
        'start_date': utilities.convert_date_to_iso(parsed_dates['start_date']),
        'end_date': utilities.convert_date_to_iso(parsed_dates['end_date']),
    }

    barcode = cells[6].string.strip()
    # A link in the sixth cell marks the item as digitised.
    is_digitised = cells[5].find('a') is not None

    item = {
        'series': series,
        'control_symbol': control_symbol,
        'title': title,
        'access_status': access_status,
        'location': location,
        'contents_dates': contents_dates,
        'digitised_status': is_digitised,
    }
    if self.get_digitised:
        # Page counts are only looked up for digitised items; others get 0.
        if is_digitised:
            item['digitised_pages'] = self.get_digitised_pages(barcode)
        else:
            item['digitised_pages'] = 0
    item['identifier'] = barcode
    return item
def write_csv(function=None):
    """Write one CSV file per function document, listing its agencies.

    Each matching document in the ``functions`` collection produces
    ``data/<name>.csv``, where ``<name>`` is the function name lower-cased
    with spaces replaced by underscores. The ``data`` directory is not
    created here.

    Args:
        function: Name of a single function to export. When falsy, every
            document in the collection is exported.
    """
    if function:
        query = {'function': function}
    else:
        query = {}

    dbclient = MongoClient(MONGOLAB_URL)
    try:
        db = dbclient.get_default_database()
        for func in db.functions.find(query):
            filename = 'data/{}.csv'.format(
                func['function'].lower().replace(' ', '_'))
            # Binary mode is what the Python 2 csv module expects.
            with open(filename, 'wb') as functions_file:
                functions_csv = csv.writer(functions_file)
                functions_csv.writerow([
                    'agency_id',
                    'agency_title',
                    'agency_status',
                    'location',
                    'agency_start',
                    'agency_end',
                    'function_start',
                    'function_end'
                ])
                for agency in func['agencies']:
                    functions_csv.writerow([
                        agency['agency_id'],
                        # En dashes are swapped for ASCII hyphens before
                        # the title is written.
                        agency['title'].replace(u'\u2013', '-'),
                        agency['agency_status'],
                        agency['location'],
                        convert_date_to_iso(agency['start_date']),
                        convert_date_to_iso(agency['end_date']),
                        convert_date_to_iso(agency['function_start']),
                        convert_date_to_iso(agency['function_end'])
                    ])
    finally:
        # Release the connection even if the export fails part-way.
        dbclient.close()
def write_csv(function=None):
    """Export the agencies of each function document to its own CSV file.

    Files are written to ``data/<name>.csv``, where ``<name>`` is the
    function name lower-cased with spaces replaced by underscores. The
    ``data`` directory is not created here.

    Args:
        function: Restrict the export to the function with this name.
            A falsy value exports every document in the ``functions``
            collection.
    """
    columns = [
        'agency_id',
        'agency_title',
        'agency_status',
        'location',
        'agency_start',
        'agency_end',
        'function_start',
        'function_end'
    ]
    query = {'function': function} if function else {}

    dbclient = MongoClient(MONGOLAB_URL)
    try:
        collection = dbclient.get_default_database().functions
        for func in collection.find(query):
            slug = func['function'].lower().replace(' ', '_')
            filename = 'data/{}.csv'.format(slug)
            # Binary mode is what the Python 2 csv module expects.
            with open(filename, 'wb') as functions_file:
                functions_csv = csv.writer(functions_file)
                functions_csv.writerow(columns)
                # The generator keeps rows being written one at a time.
                functions_csv.writerows(
                    [
                        agency['agency_id'],
                        # En dashes become ASCII hyphens in the output.
                        agency['title'].replace(u'\u2013', '-'),
                        agency['agency_status'],
                        agency['location'],
                        convert_date_to_iso(agency['start_date']),
                        convert_date_to_iso(agency['end_date']),
                        convert_date_to_iso(agency['function_start']),
                        convert_date_to_iso(agency['function_end'])
                    ]
                    for agency in func['agencies']
                )
    finally:
        # Always hand the connection back, including on errors.
        dbclient.close()
def _get_formatted_dates(self, label, entity_id, date_format):
    """Return the dates found under ``label`` in the requested format.

    Args:
        label: Label of the value to read, passed to ``self._get_value``.
        entity_id: Identifier passed through to ``self._get_value``.
        date_format: ``'iso'`` for ISO-formatted start/end dates, or
            ``'obj'`` to return the result of
            ``utilities.process_date_string`` unchanged.

    Returns:
        dict: For ``'iso'``, a dict with ``date_str``, ``start_date`` and
        ``end_date``. For ``'obj'``, whatever
        ``utilities.process_date_string`` returns. When no date value is
        available, all three keys are present and set to ``None``.

    Raises:
        ValueError: If a date value is present but ``date_format`` is
            neither ``'iso'`` nor ``'obj'``.
    """
    try:
        date_str = self._get_value(label, entity_id)
    except AttributeError:
        # No value could be read (presumably the label is absent -- confirm
        # against _get_value). Report an empty date instead of failing on
        # names that were never bound.
        date_str = None

    if not date_str:
        return {'date_str': None, 'start_date': None, 'end_date': None}

    dates = utilities.process_date_string(date_str)
    if date_format == 'iso':
        return {
            'date_str': date_str,
            'start_date': utilities.convert_date_to_iso(dates['start_date']),
            'end_date': utilities.convert_date_to_iso(dates['end_date']),
        }
    if date_format == 'obj':
        return dates
    raise ValueError(
        "Unsupported date_format {!r}; expected 'iso' or 'obj'".format(
            date_format))
def summarise_agency(function, agency):
    """Print summary statistics for one agency's exercise of a function.

    Looks up the function document by name, and for every entry in its
    ``agencies`` list whose ``agency_id`` equals ``agency`` prints the
    series count, described and digitised item totals, and the shelf
    quantity held in series with and without described items.

    Args:
        function: Name of the function document to look up.
        agency: Agency identifier to match against ``agency_id``.

    Raises:
        TypeError: If no function document matches (``find_one`` returns
            ``None``, which is then subscripted).
    """

    def percentage(part, whole):
        # A zero denominator is reported as 0% instead of raising
        # ZeroDivisionError (e.g. an agency with no described items).
        if not whole:
            return 0.0
        return (float(part) / whole) * 100

    dbclient = MongoClient(MONGOLAB_URL)
    try:
        db = dbclient.get_default_database()
        func = db.functions.find_one({'function': function})
        for ag in func['agencies']:
            if ag['agency_id'] != agency:
                continue
            agency_title = ag['title']
            function_start = convert_date_to_iso(ag['function_start'])
            function_end = convert_date_to_iso(ag['function_end'])
            total_series = len(ag['series'])
            total_described = 0
            total_digitised = 0
            total_undescribed = 0
            quantity_undescribed = 0
            quantity_described = 0
            for series in ag['series']:
                total_described += series['items_described_in_period']
                total_digitised += series['items_digitised_in_period']
                se = db.series.find_one({'identifier': series['series_id']})
                quantity = 0
                # A series with no stored record contributes no quantity.
                if se is not None and 'locations' in se:
                    for location in se['locations']:
                        if 'quantity' in location:
                            quantity += location['quantity']
                if series['items_described'] == 0:
                    total_undescribed += 1
                    quantity_undescribed += quantity
                else:
                    quantity_described += quantity
            # Single-argument print(...) behaves the same as the Python 2
            # print statement.
            print('Agency: {}, {}'.format(agency, agency_title))
            print('Function: {} from {} to {}'.format(
                function, function_start, function_end))
            print('Total series: {}'.format(total_series))
            print('Items described: {}'.format(total_described))
            print('Items digitised: {} ({:.2f}%)'.format(
                total_digitised,
                percentage(total_digitised, total_described)))
            print('Series with no items described: {} ({:.2f}%)'.format(
                total_undescribed,
                percentage(total_undescribed, total_series)))
            # NOTE(review): the percentage is relative to the quantity in
            # described series, not the combined total -- confirm intent.
            print('Quantity undescribed: at least {} of {} metres ({:.2f}%)'.format(
                quantity_undescribed,
                quantity_described,
                percentage(quantity_undescribed, quantity_described)))
    finally:
        # Release the connection whether or not the summary succeeded.
        dbclient.close()
def _get_relations(self, label, entity_id, date_format):
    """Extract the related entities listed under ``label``.

    Each ``li`` inside the cell returned by ``self._get_cell`` becomes one
    dict. Processed ``li`` elements and finally the cell itself are removed
    from the parse tree with ``decompose()``.

    Args:
        label: Label of the cell to read, passed to ``self._get_cell``.
        entity_id: Identifier passed through to ``self._get_cell``.
        date_format: ``'iso'`` to convert start/end dates with
            ``utilities.convert_date_to_iso``, or ``'obj'`` to use the
            values from ``utilities.process_date_string`` as they are.

    Returns:
        list of dicts with the keys ``date_str``, ``start_date``,
        ``end_date``, ``identifier`` and ``title``, or ``None`` when the
        cell does not exist.

    Raises:
        ValueError: If a relation is found and ``date_format`` is neither
            ``'iso'`` nor ``'obj'``.
    """
    cell = self._get_cell(label, entity_id)
    if cell is None:
        return None

    relations = []
    for relation in cell.findAll('li'):
        try:
            date_str = relation.find('div', 'dates').string.strip()
        except AttributeError:
            # No usable dates div for this relation: use empty dates.
            date_str = ''
            dates = {'date_str': '', 'start_date': None, 'end_date': None}
        else:
            dates = utilities.process_date_string(date_str)

        if date_format == 'iso':
            formatted_dates = {
                'date_str': date_str,
                'start_date': utilities.convert_date_to_iso(dates['start_date']),
                'end_date': utilities.convert_date_to_iso(dates['end_date']),
            }
        elif date_format == 'obj':
            formatted_dates = dates
        else:
            # Fail loudly rather than reuse the previous relation's dates
            # or hit an unbound local.
            raise ValueError(
                "Unsupported date_format {!r}; expected 'iso' or 'obj'".format(
                    date_format))

        details = list(relation.find('div', 'linkagesInfo').stripped_strings)
        identifier = details[0]
        if len(details) > 1:
            # Drop the first two characters of the second string
            # (presumably a separator before the title -- confirm).
            title = details[1][2:]
        else:
            # With no separate title string, the identifier doubles as title.
            title = identifier

        relations.append({
            'date_str': formatted_dates['date_str'],
            'start_date': formatted_dates['start_date'],
            'end_date': formatted_dates['end_date'],
            'identifier': identifier,
            'title': title
        })
        relation.decompose()
    cell.decompose()
    return relations
def write_agency_csv(function, agency):
    """Write a CSV of series counts for one agency under a function.

    For every entry in the function document's ``agencies`` list whose
    ``agency_id`` equals ``agency``, a file is written to
    ``data/<function>-<agency>-<start>-<end>.csv`` with one row per series.
    The ``data`` directory is not created here.

    Args:
        function: Name of the function document to look up.
        agency: Agency identifier to match against ``agency_id``.

    Raises:
        TypeError: If no function document matches (``find_one`` returns
            ``None``, which is then subscripted).
    """
    dbclient = MongoClient(MONGOLAB_URL)
    try:
        db = dbclient.get_default_database()
        func = db.functions.find_one({'function': function})
        for ag in func['agencies']:
            if ag['agency_id'] != agency:
                continue
            filename = 'data/{}-{}-{}-{}.csv'.format(
                func['function'].lower().replace(' ', '_'),
                agency.replace(' ', '_'),
                convert_date_to_iso(ag['function_start']),
                convert_date_to_iso(ag['function_end']))
            # Binary mode is what the Python 2 csv module expects.
            with open(filename, 'wb') as agency_file:
                agency_csv = csv.writer(agency_file)
                agency_csv.writerow([
                    'series_id',
                    'series_title',
                    'number_described',
                    'number_digitised'
                ])
                for series in ag['series']:
                    agency_csv.writerow([
                        series['series_id'],
                        series['title'],
                        series['items_described_in_period'],
                        series['items_digitised_in_period']
                    ])
    finally:
        # Release the connection even if the export fails part-way.
        dbclient.close()
def summarise_agency(function, agency):
    """Print a coverage summary for an agency's records under a function.

    The function document is fetched by name. For each entry in its
    ``agencies`` list whose ``agency_id`` equals ``agency``, the totals of
    described and digitised items and the shelf quantities of described
    and undescribed series are printed.

    Args:
        function: Name of the function document to look up.
        agency: Agency identifier to match against ``agency_id``.

    Raises:
        TypeError: If no function document matches (``find_one`` returns
            ``None``, which is then subscripted).
    """

    def as_percent(numerator, denominator):
        # Guard against ZeroDivisionError when a total is zero; such
        # ratios are reported as 0%.
        if denominator == 0:
            return 0.0
        return (float(numerator) / denominator) * 100

    dbclient = MongoClient(MONGOLAB_URL)
    try:
        db = dbclient.get_default_database()
        func = db.functions.find_one({'function': function})
        matching = [ag for ag in func['agencies'] if ag['agency_id'] == agency]
        for ag in matching:
            agency_title = ag['title']
            function_start = convert_date_to_iso(ag['function_start'])
            function_end = convert_date_to_iso(ag['function_end'])
            total_series = len(ag['series'])
            total_described = 0
            total_digitised = 0
            total_undescribed = 0
            quantity_undescribed = 0
            quantity_described = 0
            for series in ag['series']:
                total_described += series['items_described_in_period']
                total_digitised += series['items_digitised_in_period']
                se = db.series.find_one({'identifier': series['series_id']})
                # A missing series record, or one without locations,
                # counts as zero quantity.
                locations = se['locations'] if se and 'locations' in se else []
                quantity = sum(
                    location['quantity']
                    for location in locations
                    if 'quantity' in location
                )
                if series['items_described'] == 0:
                    total_undescribed += 1
                    quantity_undescribed += quantity
                else:
                    quantity_described += quantity

            # Each print(...) takes one argument, so it behaves the same
            # as the Python 2 print statement.
            print('Agency: {}, {}'.format(agency, agency_title))
            print('Function: {} from {} to {}'.format(function, function_start, function_end))
            print('Total series: {}'.format(total_series))
            print('Items described: {}'.format(total_described))
            print('Items digitised: {} ({:.2f}%)'.format(
                total_digitised, as_percent(total_digitised, total_described)))
            print('Series with no items described: {} ({:.2f}%)'.format(
                total_undescribed, as_percent(total_undescribed, total_series)))
            # NOTE(review): the ratio uses the described quantity as its
            # base, not described + undescribed -- confirm intent.
            print('Quantity undescribed: at least {} of {} metres ({:.2f}%)'.format(
                quantity_undescribed, quantity_described,
                as_percent(quantity_undescribed, quantity_described)))
    finally:
        # Always close the connection, including on errors.
        dbclient.close()
def write_agency_csv(function, agency):
    """Export per-series item counts for an agency to a CSV file.

    The function document is fetched by name. Each entry in its
    ``agencies`` list whose ``agency_id`` equals ``agency`` is written to
    ``data/<function>-<agency>-<start>-<end>.csv``; spaces in the function
    and agency parts are replaced by underscores. The ``data`` directory
    is not created here.

    Args:
        function: Name of the function document to look up.
        agency: Agency identifier to match against ``agency_id``.

    Raises:
        TypeError: If no function document matches (``find_one`` returns
            ``None``, which is then subscripted).
    """
    columns = [
        'series_id',
        'series_title',
        'number_described',
        'number_digitised'
    ]

    dbclient = MongoClient(MONGOLAB_URL)
    try:
        db = dbclient.get_default_database()
        func = db.functions.find_one({'function': function})
        for ag in func['agencies']:
            if ag['agency_id'] == agency:
                name_parts = (
                    func['function'].lower().replace(' ', '_'),
                    agency.replace(' ', '_'),
                    convert_date_to_iso(ag['function_start']),
                    convert_date_to_iso(ag['function_end']),
                )
                filename = 'data/{}-{}-{}-{}.csv'.format(*name_parts)
                # Binary mode is what the Python 2 csv module expects.
                with open(filename, 'wb') as agency_file:
                    agency_csv = csv.writer(agency_file)
                    agency_csv.writerow(columns)
                    # The generator keeps rows being written one at a time.
                    agency_csv.writerows(
                        [
                            series['series_id'],
                            series['title'],
                            series['items_described_in_period'],
                            series['items_digitised_in_period']
                        ]
                        for series in ag['series']
                    )
    finally:
        # Always close the connection, including on errors.
        dbclient.close()