def _process_row(self, row):
    """Build an item dictionary from the ``td`` cells of one table row.

    Cells are read by fixed position: 1 is the series, 2 the control
    symbol (inside a link), 3 the title together with the access status
    and location ``div`` elements, 4 the contents date string, 5 contains
    a link when the item is digitised, and 7 is the barcode.

    ``digitised_pages`` is only added when ``self.get_digitised`` is
    truthy; it is 0 for rows that have no link in cell 5.

    Returns the populated dictionary.
    """
    tds = row.findAll('td')
    record = {
        'series': tds[1].string.strip(),
        'control_symbol': tds[2].a.string.strip(),
    }
    title_cell = tds[3]
    record['title'] = title_cell.contents[0].string.strip()

    # Access status and location are labelled text in divs of the title cell.
    labelled_fields = (
        ('access_status', 'CombinedTitleBottomLeft', r'Access status: (\w+)'),
        ('location', 'CombinedTitleBottomRight', r'Location: (\w+)'),
    )
    for key, css_class, pattern in labelled_fields:
        text = title_cell.find('div', css_class).string
        record[key] = re.search(pattern, text).group(1)

    raw_dates = tds[4].string.strip()
    parsed = utilities.process_date_string(raw_dates)
    record['contents_dates'] = {
        'date_str': raw_dates,
        'start_date': utilities.convert_date_to_iso(parsed['start_date']),
        'end_date': utilities.convert_date_to_iso(parsed['end_date']),
    }

    barcode = tds[7].string.strip()
    has_digital_copy = tds[5].find('a') is not None
    record['digitised_status'] = has_digital_copy
    if self.get_digitised:
        # Only rows with a digitised copy need the page-count lookup.
        record['digitised_pages'] = (
            self.get_digitised_pages(barcode) if has_digital_copy else 0
        )
    record['identifier'] = barcode
    return record
def _process_row(self, row):
    """Build an item dictionary from the ``td`` cells of one table row.

    Cells are read by fixed position: 1 is the series, 2 the control
    symbol (inside a link), 3 the title together with the access status
    and location ``div`` elements, 4 the date string, 5 contains a link
    when the item is digitised, and 7 is the barcode.

    ``digitised_pages`` is always present: the result of
    ``self.get_digitised_pages`` for digitised rows, otherwise 0.

    Returns the populated dictionary.
    """
    tds = row.findAll('td')
    record = {
        'series': tds[1].string.strip(),
        'control_symbol': tds[2].a.string.strip(),
    }
    title_cell = tds[3]
    record['title'] = title_cell.contents[0].string.strip()

    # Access status and location are labelled text in divs of the title cell.
    status_text = title_cell.find('div', 'CombinedTitleBottomLeft').string
    status_match = re.search(r'Access status: (\w+)', status_text)
    record['access_status'] = status_match.group(1)
    place_text = title_cell.find('div', 'CombinedTitleBottomRight').string
    place_match = re.search(r'Location: (\w+)', place_text)
    record['location'] = place_match.group(1)

    raw_dates = tds[4].string.strip()
    parsed = utilities.process_date_string(raw_dates)
    record['date_range'] = {
        'date_str': raw_dates,
        'start_date': utilities.convert_date_to_iso(parsed['start_date']),
        'end_date': utilities.convert_date_to_iso(parsed['end_date']),
    }

    barcode = tds[7].string.strip()
    has_digital_copy = tds[5].find('a') is not None
    record['digitised_status'] = has_digital_copy
    # Only rows with a digitised copy need the page-count lookup.
    record['digitised_pages'] = (
        self.get_digitised_pages(barcode) if has_digital_copy else 0
    )
    record['identifier'] = barcode
    return record
def test_process_date_string(self):
    """Check the dictionary ``process_date_string`` builds for a date range."""
    expectations = [
        (
            '2 June 1884 - Sep 1884',
            {
                'date_str': '2 June 1884 - Sep 1884',
                'start_date': {
                    'date': datetime.datetime(1884, 6, 2),
                    'day': True,
                    'month': True,
                },
                'end_date': {
                    'date': datetime.datetime(1884, 9, 1),
                    'day': False,
                    'month': True,
                },
            },
        ),
    ]
    for raw, expected in expectations:
        self.assertEqual(utilities.process_date_string(raw), expected)
def test_process_date(self):
    """Check the list of parsed dates produced for single dates and a range.

    NOTE(review): despite the test name, this calls
    ``utilities.process_date_string`` and expects a list of date
    dictionaries -- confirm this is the helper and return shape intended.
    """
    def parsed(year, month, day, has_day, has_month):
        # Shorthand for one expected date dictionary.
        return {
            'date': datetime.datetime(year, month, day),
            'day': has_day,
            'month': has_month,
        }

    expectations = [
        ('2 June 1884', [parsed(1884, 6, 2, True, True)]),
        ('03 Jul 1921', [parsed(1921, 7, 3, True, True)]),
        ('13 Jul. 1921', [parsed(1921, 7, 13, True, True)]),
        ('Dec 1778', [parsed(1778, 12, 1, False, True)]),
        ('1962', [parsed(1962, 1, 1, False, False)]),
        (
            '2 June 1884 - Sep 1884',
            [
                parsed(1884, 6, 2, True, True),
                parsed(1884, 9, 1, False, True),
            ],
        ),
    ]
    for raw, expected in expectations:
        self.assertEqual(utilities.process_date_string(raw), expected)
def _get_formatted_dates(self, label, entity_id, date_format):
    """Return the dates found under ``label`` for an entity.

    Args:
        label: Label of the value to read; passed to ``self._get_value``.
        entity_id: Entity identifier; passed to ``self._get_value``.
        date_format: ``'iso'`` to convert the start and end dates with
            ``utilities.convert_date_to_iso``, or ``'obj'`` to return the
            result of ``utilities.process_date_string`` unchanged.

    Returns:
        For ``'iso'``, a dictionary with ``date_str``, ``start_date`` and
        ``end_date`` keys. For ``'obj'``, the parsed dates as returned by
        ``utilities.process_date_string``. When the value lookup raises
        ``AttributeError``, an empty ``date_str`` with ``None`` start and
        end dates is used in place of parsed dates.

    Raises:
        ValueError: If ``date_format`` is neither ``'iso'`` nor ``'obj'``.
    """
    try:
        date_str = self._get_value(label, entity_id)
    except AttributeError:
        # The failed lookup never bound date_str, so bind it here; without
        # this the fallback below raised UnboundLocalError.
        date_str = ''
        dates = {'date_str': date_str, 'start_date': None, 'end_date': None}
    else:
        dates = utilities.process_date_string(date_str)

    if date_format == 'iso':
        # NOTE(review): on the fallback path convert_date_to_iso receives
        # None -- confirm the helper accepts that.
        return {
            'date_str': date_str,
            'start_date': utilities.convert_date_to_iso(dates['start_date']),
            'end_date': utilities.convert_date_to_iso(dates['end_date']),
        }
    if date_format == 'obj':
        return dates
    raise ValueError(
        "date_format must be 'iso' or 'obj', got {!r}".format(date_format)
    )
def _get_relations(self, label, entity_id, date_format):
    """Return the related entities listed under ``label``.

    Each ``li`` in the cell returned by ``self._get_cell`` becomes one
    dictionary with ``date_str``, ``start_date``, ``end_date``,
    ``identifier`` and ``title`` keys.

    Args:
        label: Label of the cell to read; passed to ``self._get_cell``.
        entity_id: Entity identifier; passed to ``self._get_cell``.
        date_format: ``'iso'`` to convert the start and end dates with
            ``utilities.convert_date_to_iso``, or ``'obj'`` to keep the
            values from ``utilities.process_date_string``.

    Returns:
        A list of relation dictionaries (empty when the cell has no
        ``li`` elements), or ``None`` when there is no cell.

    Raises:
        ValueError: If a relation is processed and ``date_format`` is
            neither ``'iso'`` nor ``'obj'``.
    """
    cell = self._get_cell(label, entity_id)
    if cell is None:
        return None

    relations = []
    for relation in cell.findAll('li'):
        try:
            date_str = relation.find('div', 'dates').string.strip()
        except AttributeError:
            # No usable dates element. date_str was never bound by the
            # failed lookup, so bind it here; without this the fallback
            # raised UnboundLocalError (or reused the previous relation's
            # date string).
            date_str = ''
            dates = {'date_str': date_str, 'start_date': None, 'end_date': None}
        else:
            dates = utilities.process_date_string(date_str)

        if date_format == 'iso':
            # NOTE(review): on the fallback path convert_date_to_iso
            # receives None -- confirm the helper accepts that.
            formatted_dates = {
                'date_str': date_str,
                'start_date': utilities.convert_date_to_iso(dates['start_date']),
                'end_date': utilities.convert_date_to_iso(dates['end_date']),
            }
        elif date_format == 'obj':
            formatted_dates = dates
        else:
            raise ValueError(
                "date_format must be 'iso' or 'obj', got {!r}".format(date_format)
            )

        details = list(relation.find('div', 'linkagesInfo').stripped_strings)
        identifier = details[0]
        # The title is the second string minus its first two characters;
        # with only one string, the identifier doubles as the title.
        title = details[1][2:] if len(details) > 1 else identifier

        relations.append({
            'date_str': formatted_dates['date_str'],
            'start_date': formatted_dates['start_date'],
            'end_date': formatted_dates['end_date'],
            'identifier': identifier,
            'title': title,
        })
    return relations
def _get_formatted_dates(self, label, entity_id, date_format):
    """Return the dates found under ``label`` for an entity.

    Args:
        label: Label of the value to read; passed to ``self._get_value``.
        entity_id: Entity identifier; passed to ``self._get_value``.
        date_format: ``'iso'`` to convert the start and end dates with
            ``utilities.convert_date_to_iso``, or ``'obj'`` to return the
            result of ``utilities.process_date_string`` unchanged.

    Returns:
        For ``'iso'``, a dictionary with ``date_str``, ``start_date`` and
        ``end_date`` keys. For ``'obj'``, the parsed dates as returned by
        ``utilities.process_date_string``. When the value lookup raises
        ``AttributeError``, an empty ``date_str`` with ``None`` start and
        end dates is used in place of parsed dates.

    Raises:
        ValueError: If ``date_format`` is neither ``'iso'`` nor ``'obj'``.
    """
    try:
        date_str = self._get_value(label, entity_id)
    except AttributeError:
        # The failed lookup never bound date_str, so bind it here; without
        # this the fallback below raised UnboundLocalError.
        date_str = ''
        dates = {
            'date_str': date_str,
            'start_date': None,
            'end_date': None,
        }
    else:
        dates = utilities.process_date_string(date_str)

    if date_format == 'iso':
        # NOTE(review): on the fallback path convert_date_to_iso receives
        # None -- confirm the helper accepts that.
        return {
            'date_str': date_str,
            'start_date': utilities.convert_date_to_iso(dates['start_date']),
            'end_date': utilities.convert_date_to_iso(dates['end_date']),
        }
    if date_format == 'obj':
        return dates
    raise ValueError(
        "date_format must be 'iso' or 'obj', got {!r}".format(date_format)
    )
def _get_relations(self, label, entity_id, date_format):
    """Collect the related entities listed under ``label``.

    Every ``li`` in the cell returned by ``self._get_cell`` yields one
    dictionary with ``date_str``, ``start_date``, ``end_date``,
    ``identifier`` and ``title`` keys. With ``date_format`` set to
    ``'iso'`` the start and end dates go through
    ``utilities.convert_date_to_iso``; with ``'obj'`` the values from
    ``utilities.process_date_string`` are used as they are. No other
    ``date_format`` value is handled.

    Returns a list of relation dictionaries, or ``None`` when there is no
    cell.
    """
    container = self._get_cell(label, entity_id)
    if container is None:
        return None

    found = []
    for relation in container.findAll('li'):
        try:
            date_str = relation.find('div', 'dates').string.strip()
        except AttributeError:
            # No usable dates element: fall back to empty values.
            date_str = ''
            parsed = {'date_str': '', 'start_date': None, 'end_date': None}
        else:
            parsed = utilities.process_date_string(date_str)

        if date_format == 'iso':
            start_iso = utilities.convert_date_to_iso(parsed['start_date'])
            end_iso = utilities.convert_date_to_iso(parsed['end_date'])
            formatted = {
                'date_str': date_str,
                'start_date': start_iso,
                'end_date': end_iso,
            }
        elif date_format == 'obj':
            formatted = parsed

        info = relation.find('div', 'linkagesInfo')
        parts = list(info.stripped_strings)
        identifier = parts[0]
        if len(parts) > 1:
            # The title is the second string minus its first two characters.
            title = parts[1][2:]
        else:
            # Only one string available, so it serves as the title too.
            title = parts[0]

        entry = {
            'date_str': formatted['date_str'],
            'start_date': formatted['start_date'],
            'end_date': formatted['end_date'],
            'identifier': identifier,
            'title': title,
        }
        found.append(entry)
    return found