Exemplo n.º 1
0
 def _process_row(self, row):
     """Build an item dictionary from the ``<td>`` cells of one table row.

     Cell usage (by index): 1 = series, 2 = control symbol, 3 = title plus
     the access-status and location ``div`` elements, 4 = contents date
     string, 5 = holds a link when the item is digitised, 6 = barcode.

     ``digitised_pages`` is only added when ``self.get_digitised`` is truthy;
     it is looked up via ``self.get_digitised_pages`` for digitised items and
     set to 0 otherwise.

     Args:
         row: Parsed table row exposing ``findAll`` (presumably a
             BeautifulSoup tag -- confirm against the caller).

     Returns:
         dict describing the item; the barcode is stored as ``identifier``.
     """
     columns = row.findAll('td')
     record = {
         'series': columns[1].string.strip(),
         'control_symbol': columns[2].string.strip(),
     }

     # The fourth cell combines the title with access and location details.
     title_cell = columns[3]
     record['title'] = title_cell.contents[0].string.strip()
     access_text = title_cell.find('div', 'CombinedTitleBottomLeft').string
     access_match = re.search(r'Access status: ([\w ]+)', access_text)
     record['access_status'] = access_match.group(1).strip()
     location_text = title_cell.find('div', 'CombinedTitleBottomRight').string
     location_match = re.search(r'Location: ([\w ]+)', location_text)
     record['location'] = location_match.group(1).strip()

     # Keep the raw date string alongside the ISO-formatted range.
     raw_dates = columns[4].string.strip()
     parsed = utilities.process_date_string(raw_dates)
     record['contents_dates'] = {
         'date_str': raw_dates,
         'start_date': utilities.convert_date_to_iso(parsed['start_date']),
         'end_date': utilities.convert_date_to_iso(parsed['end_date']),
     }

     barcode = columns[6].string.strip()
     # A link in the sixth cell marks the item as digitised.
     has_copy = columns[5].find('a') is not None
     record['digitised_status'] = has_copy
     if self.get_digitised:
         if has_copy:
             record['digitised_pages'] = self.get_digitised_pages(barcode)
         else:
             record['digitised_pages'] = 0
     record['identifier'] = barcode
     return record
def write_csv(function=None):
    """Write one CSV file of agencies for each matching function document.

    Each document returned from ``db.functions`` produces
    ``data/<function name lower-cased, spaces as underscores>.csv`` holding a
    header row followed by one row per entry in the document's ``agencies``.

    Args:
        function: Optional function name. When truthy only documents whose
            ``function`` field equals it are exported; otherwise every
            document in the collection is exported.
    """
    header = [
        'agency_id', 'agency_title', 'agency_status', 'location',
        'agency_start', 'agency_end', 'function_start', 'function_end'
    ]
    client = MongoClient(MONGOLAB_URL)
    db = client.get_default_database()
    query = {'function': function} if function else {}
    for func in db.functions.find(query):
        slug = func['function'].lower().replace(' ', '_')
        # 'wb' mode is kept from the original (Python 2 style csv output).
        with open('data/{}.csv'.format(slug), 'wb') as out_file:
            writer = csv.writer(out_file)
            writer.writerow(header)
            for agency in func['agencies']:
                # En dashes in titles are swapped for plain hyphens.
                title = agency['title'].replace(u'\u2013', '-')
                row = [
                    agency['agency_id'],
                    title,
                    agency['agency_status'],
                    agency['location'],
                    convert_date_to_iso(agency['start_date']),
                    convert_date_to_iso(agency['end_date']),
                    convert_date_to_iso(agency['function_start']),
                    convert_date_to_iso(agency['function_end']),
                ]
                writer.writerow(row)
def write_csv(function=None):
    """Export the agencies of each function document to its own CSV file.

    Files are written to ``data/`` and named after the document's
    ``function`` value (lower-cased, spaces replaced by underscores).

    Args:
        function: Optional function name used to filter ``db.functions``;
            a falsy value exports all documents.
    """
    # Agency fields that are passed through convert_date_to_iso, in column order.
    date_fields = ('start_date', 'end_date', 'function_start', 'function_end')

    dbclient = MongoClient(MONGOLAB_URL)
    db = dbclient.get_default_database()
    query = {}
    if function:
        query['function'] = function

    for func in db.functions.find(query):
        basename = func['function'].lower().replace(' ', '_')
        filename = 'data/{}.csv'.format(basename)
        with open(filename, 'wb') as handle:
            out = csv.writer(handle)
            out.writerow([
                'agency_id',
                'agency_title',
                'agency_status',
                'location',
                'agency_start',
                'agency_end',
                'function_start',
                'function_end',
            ])
            for agency in func['agencies']:
                fields = [
                    agency['agency_id'],
                    # Replace en dashes with plain hyphens in the title.
                    agency['title'].replace(u'\u2013', '-'),
                    agency['agency_status'],
                    agency['location'],
                ]
                for key in date_fields:
                    fields.append(convert_date_to_iso(agency[key]))
                out.writerow(fields)
Exemplo n.º 4
0
 def _get_formatted_dates(self, label, entity_id, date_format):
     try:
         date_str = self._get_value(label, entity_id)
     except AttributeError:
         dates = {'date_str': date_str, 'start_date': None, 'end_date': None}
     else:
         if date_str:
             dates = utilities.process_date_string(date_str)
             if date_format == 'iso':
                 formatted_dates = {
                     'date_str': date_str,
                     'start_date': utilities.convert_date_to_iso(dates['start_date']),
                     'end_date': utilities.convert_date_to_iso(dates['end_date']),
                 }
             elif date_format == 'obj':
                 formatted_dates = dates
         else:
             formatted_dates = {'date_str': None, 'start_date': None, 'end_date': None}
     return formatted_dates
def summarise_agency(function, agency):
    """Print summary statistics for one agency's series under a function.

    Loads the document for ``function`` from ``db.functions``, finds the
    entry in its ``agencies`` list whose ``agency_id`` equals ``agency`` (if
    several entries match, the last one is reported) and prints totals of
    series, described/digitised items and shelf quantities.

    Quantities are summed from the ``quantity`` values in the ``locations``
    list of each matching ``db.series`` document.

    Percentages whose denominator is zero are printed as 0.00 rather than
    raising ``ZeroDivisionError``. If the function or the agency cannot be
    found, a short message is printed and nothing else is reported.

    Args:
        function: Value matched against the ``function`` field.
        agency: Value matched against each agency's ``agency_id``.

    Returns:
        None. All output is printed.
    """
    def percentage(part, whole):
        # Guard against empty totals (e.g. an agency with no series).
        if not whole:
            return 0.0
        return (float(part) / whole) * 100

    dbclient = MongoClient(MONGOLAB_URL)
    db = dbclient.get_default_database()
    func = db.functions.find_one({'function': function})
    if func is None:
        # find_one returns None when no document matches.
        print('Function not found: {}'.format(function))
        return

    found = False
    for ag in func['agencies']:
        if ag['agency_id'] == agency:
            found = True
            agency_title = ag['title']
            function_start = convert_date_to_iso(ag['function_start'])
            function_end = convert_date_to_iso(ag['function_end'])
            total_series = len(ag['series'])
            total_described = 0
            total_digitised = 0
            total_undescribed = 0
            quantity_undescribed = 0
            quantity_described = 0
            for series in ag['series']:
                total_described += series['items_described_in_period']
                total_digitised += series['items_digitised_in_period']
                se = db.series.find_one({'identifier': series['series_id']})
                quantity = 0
                # A series missing from db.series contributes no quantity.
                if se is not None and 'locations' in se:
                    for location in se['locations']:
                        if 'quantity' in location:
                            quantity += location['quantity']
                if series['items_described'] == 0:
                    total_undescribed += 1
                    quantity_undescribed += quantity
                else:
                    quantity_described += quantity

    if not found:
        print('Agency {} not found for function: {}'.format(agency, function))
        return

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Agency: {}, {}'.format(agency, agency_title))
    print('Function: {} from {} to {}'.format(function, function_start,
                                              function_end))
    print('Total series: {}'.format(total_series))
    print('Items described: {}'.format(total_described))
    print('Items digitised: {} ({:.2f}%)'.format(
        total_digitised, percentage(total_digitised, total_described)))
    print('Series with no items described: {} ({:.2f}%)'.format(
        total_undescribed, percentage(total_undescribed, total_series)))
    print('Quantity undescribed: at least {} of {} metres ({:.2f}%)'.format(
        quantity_undescribed, quantity_described,
        percentage(quantity_undescribed, quantity_described)))
Exemplo n.º 6
0
 def _get_relations(self, label, entity_id, date_format):
     """Extract the related entities listed in the cell found for ``label``.

     Every ``<li>`` in the cell returned by ``self._get_cell`` yields one
     dict with the keys ``date_str``, ``start_date``, ``end_date``,
     ``identifier`` and ``title``. Each ``<li>`` and finally the cell itself
     are removed from the parsed document with ``decompose()`` once read.

     Args:
         label: Field label passed through to ``self._get_cell``.
         entity_id: Entity identifier passed through to ``self._get_cell``.
         date_format: ``'iso'`` to convert the start/end dates with
             ``utilities.convert_date_to_iso``; ``'obj'`` to use the result
             of ``utilities.process_date_string`` as is.

     Returns:
         list of dicts (possibly empty), or ``None`` when there is no cell.

     Raises:
         ValueError: if a relation is found and ``date_format`` is neither
             ``'iso'`` nor ``'obj'``.
     """
     cell = self._get_cell(label, entity_id)
     if cell is None:
         return None

     relations = []
     for relation in cell.findAll('li'):
         try:
             date_str = relation.find('div', 'dates').string.strip()
         except AttributeError:
             # No usable dates element: fall back to empty date values.
             date_str = ''
             dates = {'date_str': '', 'start_date': None, 'end_date': None}
         else:
             dates = utilities.process_date_string(date_str)

         if date_format == 'iso':
             formatted_dates = {
                 'date_str': date_str,
                 'start_date': utilities.convert_date_to_iso(dates['start_date']),
                 'end_date': utilities.convert_date_to_iso(dates['end_date']),
             }
         elif date_format == 'obj':
             formatted_dates = dates
         else:
             # Previously this fell through and failed later with an
             # unbound-variable error; report the real problem instead.
             raise ValueError(
                 "Unsupported date_format {!r}; expected 'iso' or 'obj'".format(date_format))

         details = list(relation.find('div', 'linkagesInfo').stripped_strings)
         identifier = details[0]
         if len(details) > 1:
             # The first two characters of the second string are dropped
             # (presumably a separator before the title -- confirm against
             # the page markup).
             title = details[1][2:]
         else:
             # Only one string available: use it for both fields.
             title = identifier

         relations.append({
             'date_str': formatted_dates['date_str'],
             'start_date': formatted_dates['start_date'],
             'end_date': formatted_dates['end_date'],
             'identifier': identifier,
             'title': title
         })
         relation.decompose()
     cell.decompose()
     return relations
def write_agency_csv(function, agency):
    """Write a CSV of series counts for an agency's work on a function.

    For every entry in the function document's ``agencies`` list whose
    ``agency_id`` equals ``agency``, a file named
    ``data/<function>-<agency>-<function_start>-<function_end>.csv`` is
    written with one row per series (spaces in the function and agency
    parts become underscores; the function part is lower-cased).

    Args:
        function: Value matched against the ``function`` field in
            ``db.functions``.
        agency: Value matched against each agency's ``agency_id``.
    """
    client = MongoClient(MONGOLAB_URL)
    db = client.get_default_database()
    func = db.functions.find_one({'function': function})
    for ag in func['agencies']:
        if ag['agency_id'] != agency:
            continue
        name_parts = (
            func['function'].lower().replace(' ', '_'),
            agency.replace(' ', '_'),
            convert_date_to_iso(ag['function_start']),
            convert_date_to_iso(ag['function_end']),
        )
        filename = 'data/{}-{}-{}-{}.csv'.format(*name_parts)
        with open(filename, 'wb') as out_file:
            writer = csv.writer(out_file)
            writer.writerow([
                'series_id', 'series_title', 'number_described',
                'number_digitised'
            ])
            for series in ag['series']:
                row = [
                    series['series_id'],
                    series['title'],
                    series['items_described_in_period'],
                    series['items_digitised_in_period'],
                ]
                writer.writerow(row)
def _safe_percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 if ``whole`` is 0."""
    if not whole:
        return 0.0
    return (float(part) / whole) * 100


def summarise_agency(function, agency):
    """Print a report on the series an agency created under a function.

    The function document is read from ``db.functions``; the entry of its
    ``agencies`` list with a matching ``agency_id`` is summarised (the last
    matching entry wins when there are several). For each of that entry's
    series the ``quantity`` values of the corresponding ``db.series``
    document's ``locations`` are added up and counted as described or
    undescribed depending on whether ``items_described`` is 0.

    Missing data no longer crashes the report: an unknown function or
    agency prints a one-line message, a series absent from ``db.series``
    counts as quantity 0, and percentages with a zero denominator print as
    0.00.

    Args:
        function: Value matched against the ``function`` field.
        agency: Value matched against each agency's ``agency_id``.

    Returns:
        None. All output is printed.
    """
    dbclient = MongoClient(MONGOLAB_URL)
    db = dbclient.get_default_database()
    func = db.functions.find_one({'function': function})
    if func is None:
        # find_one returns None when no document matches.
        print('Function not found: {}'.format(function))
        return

    # Keep the last matching entry, as the original loop effectively did.
    match = None
    for ag in func['agencies']:
        if ag['agency_id'] == agency:
            match = ag
    if match is None:
        print('Agency {} not found for function: {}'.format(agency, function))
        return

    agency_title = match['title']
    function_start = convert_date_to_iso(match['function_start'])
    function_end = convert_date_to_iso(match['function_end'])
    total_series = len(match['series'])
    total_described = 0
    total_digitised = 0
    total_undescribed = 0
    quantity_undescribed = 0
    quantity_described = 0
    for series in match['series']:
        total_described += series['items_described_in_period']
        total_digitised += series['items_digitised_in_period']
        se = db.series.find_one({'identifier': series['series_id']})
        quantity = 0
        if se is not None and 'locations' in se:
            for location in se['locations']:
                if 'quantity' in location:
                    quantity += location['quantity']
        if series['items_described'] == 0:
            total_undescribed += 1
            quantity_undescribed += quantity
        else:
            quantity_described += quantity

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Agency: {}, {}'.format(agency, agency_title))
    print('Function: {} from {} to {}'.format(function, function_start, function_end))
    print('Total series: {}'.format(total_series))
    print('Items described: {}'.format(total_described))
    print('Items digitised: {} ({:.2f}%)'.format(
        total_digitised, _safe_percentage(total_digitised, total_described)))
    print('Series with no items described: {} ({:.2f}%)'.format(
        total_undescribed, _safe_percentage(total_undescribed, total_series)))
    print('Quantity undescribed: at least {} of {} metres ({:.2f}%)'.format(
        quantity_undescribed, quantity_described,
        _safe_percentage(quantity_undescribed, quantity_described)))
def write_agency_csv(function, agency):
    """Dump per-series item counts for an agency/function pair to CSV.

    One file is produced for each entry in the function document's
    ``agencies`` list whose ``agency_id`` equals ``agency``. The file name
    combines the lower-cased function name, the agency id (both with spaces
    replaced by underscores) and the converted function start/end dates.

    Args:
        function: Value matched against the ``function`` field in
            ``db.functions``.
        agency: Value matched against each agency's ``agency_id``.
    """
    columns = [
        'series_id',
        'series_title',
        'number_described',
        'number_digitised'
    ]
    dbclient = MongoClient(MONGOLAB_URL)
    db = dbclient.get_default_database()
    func = db.functions.find_one({'function': function})
    matching = (ag for ag in func['agencies'] if ag['agency_id'] == agency)
    for ag in matching:
        function_slug = func['function'].lower().replace(' ', '_')
        agency_slug = agency.replace(' ', '_')
        start = convert_date_to_iso(ag['function_start'])
        end = convert_date_to_iso(ag['function_end'])
        filename = 'data/{}-{}-{}-{}.csv'.format(function_slug, agency_slug, start, end)
        with open(filename, 'wb') as agency_file:
            agency_csv = csv.writer(agency_file)
            agency_csv.writerow(columns)
            for series in ag['series']:
                series_id = series['series_id']
                series_title = series['title']
                described = series['items_described_in_period']
                digitised = series['items_digitised_in_period']
                agency_csv.writerow([series_id, series_title, described, digitised])