def clean_list_record(self, record):
    """Validate and normalize a DC service-request record.

    Coordinates are validated, ISO datetime strings are reduced to dates,
    and the inspection flag is converted to a boolean.

    Raises SkipRecord for records with unusable coordinates or for
    parking-meter service types.
    """
    if record['lat'] == '0' or record['long'] == '0':
        raise SkipRecord('Got 0 for lat/long')
    if record['lat'] is None or record['long'] is None:
        raise SkipRecord('Got no value for lat/long')
    # BUG FIX: the original compared record['long'] to the integer 0, which
    # can never equal a string field; it clearly meant the empty string,
    # mirroring the record['lat'] == '' test.
    if (record['lat'] == '' or record['long'] == '') and record['siteaddress'] == '':
        raise SkipRecord('No value found for lat/long or address')
    record['lat'] = float(record['lat'])
    record['long'] = float(record['long'])
    if record['servicetypecode'] in ('METERS',):
        raise SkipRecord('Skipping parking meter data')

    def _optional_date(value):
        # Values arrive as ISO datetimes ('YYYY-MM-DDT...'); keep only the
        # date part. Empty/None values yield None instead of crashing.
        return parse_date(value.split('T')[0], '%Y-%m-%d') if value else None

    record['order_date'] = _optional_date(record['serviceorderdate'])
    record['add_date'] = _optional_date(record['adddate'])
    record['resolution_date'] = _optional_date(record['resolutiondate'])
    record['inspection_date'] = _optional_date(record['inspectiondate'])
    record['service_due_date'] = _optional_date(record['serviceduedate'])
    record['service_notes'] = record['servicenotes'] or ''
    record['inspection_complete'] = (record['inspectionflag'] == 'Y')
    return record
def clean_list_record(self, record):
    """Keep only records from a known county that carry a parseable date."""
    county = record['county'].upper()
    if county not in self.county_names:
        raise SkipRecord('Skipping county %s' % record['county'])
    approval = parse_date(record['approval_date'], self.date_format)
    if approval is None:
        raise SkipRecord('Record has no date.')
    record['approval_date'] = approval
    return record
def clean_list_record(self, record):
    """Strip trailing withdrawal/chapter notes off cruft-prone fields,
    normalize the location and date, and drop explicitly ignored or
    location-less businesses."""
    note_patterns = (
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*w\/d.*)',
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*withd.*)',
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*ch\s+227\s+sec\s*5A.*)',
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*ch\s+bus\s+.*)',
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*c\/l.*)',
    )
    collected_notes = []
    # Peel note suffixes off each field; one field may match several patterns.
    for field in ('name', 'business_type', 'location'):
        value = record.get(field, '').strip()
        for pattern in note_patterns:
            match = re.match(pattern, value, re.I | re.M)
            if match is not None:
                parts = match.groupdict()
                value = parts['value']
                collected_notes.append(parts['notes'])
        record[field] = value.strip()
    record['notes'] = collected_notes
    record['location'] = smart_title(record['location'].strip())
    record['date'] = parse_date(record['date'].strip(), '%Y-%m-%d')
    ignore_key = (record['name'].upper(), record['location'].upper())
    if ignore_key in BUSINESS_NAMES_TO_IGNORE:
        raise SkipRecord('Skipping %s (explicitly ignored)' % record['name'])
    if record['location'] == '':
        raise SkipRecord('Skipping %s (no location)' % record['name'])
    return record
def clean_list_record(self, record):
    """Map raw UCR offense codes to categories/crime types and normalize
    the address, date, time, and beat fields.

    Raises SkipRecord when either UCR code's 2-digit prefix is in
    SKIPPED_UCR_PREFIXES.
    """
    # Get the first 2 digits of each UCR code, or None when missing.
    # IDIOM FIX: replaces the fragile `x and y or z` construct with explicit
    # conditional expressions (equivalent here, but the old idiom silently
    # misbehaves whenever the middle operand can be falsy).
    ucr1_prefix = record['offenseucr1'][:2] if record['offenseucr1'] else None
    ucr2_prefix = record['offenseucr2'][:2] if record['offenseucr2'] else None
    if ucr1_prefix in SKIPPED_UCR_PREFIXES:
        raise SkipRecord('Skipping record with UCR %s' % record['offenseucr1'])
    if ucr2_prefix in SKIPPED_UCR_PREFIXES:
        raise SkipRecord('Skipping record with UCR %s' % record['offenseucr2'])
    record['address'] = self.normalizer.normalize_address(record)
    # If we can't find the code in the mapping, just use the code itself
    # as the value.
    record['category'] = CATEGORY_MAPPING.get(ucr1_prefix, ucr1_prefix)
    record['crime_type'] = UCR_MAPPING.get(record['offenseucr1'],
                                           record['offenseucr1'])
    record['secondary_category'] = CATEGORY_MAPPING.get(ucr2_prefix, ucr2_prefix)
    record['secondary_crime_type'] = UCR_MAPPING.get(record['offenseucr2'],
                                                     record['offenseucr2'])
    record['offensedate'] = parse_date(record['offensedate'], '%m/%d/%Y')
    # strptime fields [3:5] are (hour, minute); seconds are discarded.
    record['offensestarttime'] = datetime.time(
        *time.strptime(record['offensestarttime'], '%H:%M:%S')[3:5])
    record['offensebeat'] = record['offensebeat'] or ''
    return record
def clean_list_record(self, record):
    """Normalize a building-permit record: permit types, location, dates.

    Raises SkipRecord for the known-bad WORKDESC '4635', for records with
    no location, and for unparseable issue dates.
    """
    if record['WORKDESC'] == '4635':
        raise SkipRecord('WORKDESC is 4635')
    if record['JOBLOCATION'] is None:
        raise SkipRecord('Record has no location')
    if record['PERMITAPPROVALS'] is None:
        record['permit_types'] = []
    else:
        record['permit_types'] = [
            self.clean_permit_type(pa)
            for pa in record['PERMITAPPROVALS'].split(',')
        ]
    # Addresses and extra data generally seem to be separated by 2 or more spaces.
    record['location'] = record['JOBLOCATION'].split('  ')[0]
    # BUG FIX: if neither format matched, the original fell off the end of
    # the loop and hit a NameError on the unbound `issue_date`; the for/else
    # now raises a meaningful SkipRecord instead. (Also renamed the loop
    # variable, which shadowed the `format` builtin.)
    for date_format in ('%d-%b-%y', '%m/%d/%Y'):
        try:
            issue_date = parse_date(record['ISSUEDATE'], date_format)
            break
        except ValueError:
            continue
    else:
        raise SkipRecord('Unparseable ISSUEDATE %r' % record['ISSUEDATE'])
    record['issue_date'] = issue_date
    record['WORKDESC'] = record['WORKDESC'].strip()
    record['SUBDESC'] = record['SUBDESC'].strip()
    return record
def clean_list_record(self, record):
    """Clean one DOB sign-permit spreadsheet row.

    Builds the address, converts 'Y' columns to booleans, and skips rows
    with malformed action dates or unusable sign text. Older spreadsheets
    lack some columns; those are filled in (or left None) as noted below.

    Raises SkipRecord for unusable rows.
    """
    record['address'] = '%s %s, %s' % (record['House #'],
                                       smart_title(record['Street Name']),
                                       smart_title(record['Borough']))
    record['is_landmark'] = record['Landmark'] == 'Y'
    record['is_adult_establishment'] = record['Adult Estab'] == 'Y'
    record['is_city_owned'] = record['City Owned'] == 'Y'
    # Missing column -> 'Not available'; present-but-empty -> 'Not illuminated'.
    record['illumination_type'] = record.get(
        'Sign Illumination Type', 'Not available') or 'Not illuminated'
    if not isinstance(record['Latest Action Date'], datetime.datetime):
        self.logger.info('Skipping job #%s, with latest action date %s',
                         record.get('Job #'), record['Latest Action Date'])
        raise SkipRecord()
    try:
        # .strip() raises AttributeError when the spreadsheet cell was
        # parsed as a number rather than a string; handled below.
        record['sign_text'] = record['Text on Sign'].strip()
        if len(record['sign_text']) > 255:
            # Some records are malformed and have a bad and long value
            # for sign text. (SkipRecord is not an AttributeError, so it
            # propagates past the except clause below.)
            self.logger.info('Skipping job #%s, with Text on Sign %s',
                             record.get('Job #'), record['Text on Sign'])
            raise SkipRecord()
    except AttributeError:
        # Numeric cell: render it as a string; TypeError means it was
        # neither a string nor a number (e.g. None), so skip the row.
        try:
            record['sign_text'] = str(int(record['Text on Sign']))
        except TypeError:
            self.logger.info('Skipping job #%s, with Text on Sign %s',
                             record.get('Job #'), record['Text on Sign'])
            raise SkipRecord()
    try:
        record['sign_for'] = record['Sign Advertising']
    except KeyError:
        # Older spreadsheets call this column 'Usage'.
        record['sign_for'] = record['Usage']
    try:
        record['is_near_highway'] = record['Sign Near Highway'] == 'Y'
    except KeyError:
        # Older spreadsheets don't have a 'Sign Near Highway' column,
        # and there's nothing we can do about it. They have a column called
        # 'Adjacent to Arterial Highway', but that's not necessarily the
        # same thing.
        record['is_near_highway'] = None
    try:
        record['is_changeable_copy'] = record['Sign Changeable Copy'] == 'Y'
    except KeyError:
        # Older spreadsheets don't have a 'Sign Changeable Copy' column,
        # but we can deduce the value: if there's text, then it's not
        # changeable. Otherwise, it's NULL.
        if record['sign_text']:
            record['is_changeable_copy'] = False
        else:
            record['is_changeable_copy'] = None
    return record
def clean_list_record(self, rss_record):
    """Parse one fire-dispatch RSS entry into a record dict.

    Entries come in two textual formats — '*UPDATE: ...' follow-ups and
    original incident posts — each parsed with its own regex. The Google
    Groups link supplies incident/message IDs used as unique identifiers.

    Raises SkipRecord when the summary doesn't match the expected format,
    and ScraperBroken when the link URL is malformed.
    """
    record = {
        'pub_date': datetime.date(*rss_record.pop('updated_parsed')[:3]),
        'summary': rss_record['summary'].strip(),
    }
    if re.search(r'^(?i)\*UPDATE:', record['summary']):
        m = re.search(
            r'^\*UPDATE:\s*(?P<location_name>[^\*]*)\*\s*(?P<description>.*)\s*-\s*(?P<reporter>.*?)\#\#\#$',
            record['summary'])
        if not m:
            self.logger.warn('Could not parse update %r' % record['summary'])
            raise SkipRecord('Could not parse update %r' % record['summary'])
        record.update(m.groupdict())
        # Updates don't carry these fields; fill with empty strings so the
        # record shape matches original incident posts.
        record.update({
            'is_update': True,
            'incident_type': '',
            'fire_station': '',
            'radio_channels': '',
            'incident_time': '',
        })
    else:
        # Not an update
        m = re.search(
            r'^\*(?P<incident_type>[^\*]*)\*\s*(?P<location_name>[^;]*);\s*MAP (?:\d+[- ]\w\d)?;\s*FS (?P<fire_station>\d+); (?P<description>.*?); Ch:(?P<radio_channels>[\d, ]+)\s*@(?P<incident_time>\d\d?:\d\d [AP]M)?\s*-(?P<reporter>.*?)\#\#\#$',
            record['summary'])
        if not m:
            raise SkipRecord('Could not parse %r' % record['summary'])
        record.update(m.groupdict())
        record['incident_type'] = record['incident_type'].upper()  # Normalize
        record['radio_channels'] = ','.join(
            record['radio_channels'].split(','))
        record['is_update'] = False
    record['description'] = record['description'].replace(
        ' ', ' ').replace('"', '"').replace('&', '&').strip()
    record['location_name'] = record['location_name'].strip()
    # Get the incident ID and message ID from the Google Groups URL.
    # We'll use these as unique identifiers.
    m = re.search(
        r'browse_thread/thread/(?P<incident_id>[^/]*)/(?P<message_id>[^\?]*)\?',
        rss_record['link'])
    if not m:
        raise ScraperBroken('Got weird URL: %r', rss_record['link'])
    record.update(m.groupdict())
    record['link'] = rss_record['link']
    # I can't figure out why this record is causing errors, so for now
    # we'll just skip it.
    if record['message_id'] == '0faabeab3aad8492':
        raise SkipRecord()
    return record
def clean_list_record(self, record):
    """Normalize a road-project record: coordinates, a human-readable
    location name, optional project dates, and defaulted text fields."""
    if record['lat'] == '0' or record['long'] == '0':
        raise SkipRecord('Got 0 for lat/long')
    record['lat'] = float(record['lat'])
    record['long'] = float(record['long'])
    record['location_name'] = '%s %s, from %s to %s' % (
        record['street'], record['quadrant'],
        record['fromintersection'], record['tointersection'])

    def optional_date(raw):
        # Values look like ISO datetimes; only the date part is wanted.
        if raw:
            return parse_date(raw.split('T')[0], '%Y-%m-%d')
        return None

    record['estimated_completion_date'] = optional_date(
        record['estimatedcompletiondate'])
    record['actual_completion_date'] = optional_date(
        record['actualcompletiondate'])
    record['estimated_start_date'] = optional_date(
        record['estimatedstartdate'])
    record['project_name'] = record['projectname'] or ''
    record['road_type'] = record['functionalclass'] or 'N/A'
    return record
def clean_list_record(self, record):
    """Convert a raw event record (with a venue sub-dict) into our cleaned
    event dict; skips events that have no venue."""
    venue = record.get('venue', {})
    if not venue:
        raise SkipRecord("No venue")
    address_keys = ('address_1', 'address_2', 'city', 'state', 'zip')
    pieces = (venue.get(key, '').strip() for key in address_keys)
    location_name = ', '.join(piece for piece in pieces if piece)
    # Event times are epoch milliseconds.
    event_time = datetime.datetime.fromtimestamp(record['time'] / 1000.0,
                                                 local_tz)
    cleaned = {
        'title': text_from_html(record['name']),
        'description': text_from_html(record.get('description', '')),
        'location_name': location_name,
        'location': Point(venue['lon'], venue['lat']),
        'url': record['event_url'],
        'item_date': event_time.date(),
    }
    cleaned['_attributes'] = {
        'venue_phone': venue.get('phone', ''),
        'venue_name': text_from_html(venue.get('name', '')),
        'start_time': event_time.time(),
        'group_name': record['group']['name'],
    }
    return cleaned
def clean_list_record(self, record):
    """Normalize a record, keeping only those in the configured counties."""
    county = record['county'].upper().strip()
    if county not in self.counties:
        raise SkipRecord('Record not in %s.' % self.counties)
    record.update(
        activity_date=parse_date(record['activity_date'], '%m/%d/%Y'),
        dba=smart_title(record['dba']),
        address=clean_address(record['address']),
    )
    return record
def clean_list_record(self, record):
    """Run the parent cleaner, then discard one known-bad record."""
    record = super(Scraper, self).clean_list_record(record)
    # Special-case: a bad record identified on 2009-02-11.
    ids = (str(record['inspection_visit_id']), str(record['license_id']))
    if ids == ('3145783', '2164597'):
        raise SkipRecord('Invalid record')
    return record
def clean_list_record(self, record):
    """Attach the permit's location and parse the inspection date;
    skip records whose permit number has no known location."""
    location = self.get_location(record['PERMIT_NUMBER'])
    if location is None:
        raise SkipRecord("No permit found for '%s'" % record['PERMIT_NUMBER'])
    record['location'] = location
    record['InspectionDate'] = parse_date(record['InspectionDate'],
                                          '%Y-%m-%dT00:00:00')
    return record
def clean_list_record(self, record):
    """Trim text fields, parse the date, and drop explicitly ignored
    businesses."""
    for field in ('name', 'business_type'):
        record[field] = record[field].strip()
    record['location'] = smart_title(record['location'].strip())
    record['date'] = parse_date(record['date'].strip(), '%Y-%m-%d')
    ignore_key = (record['name'].upper(), record['location'].upper())
    if ignore_key in BUSINESS_NAMES_TO_IGNORE:
        raise SkipRecord('Skipping %s' % record['name'])
    return record
def clean_list_record(self, record):
    """Clean a police-report record: coordinates, report date, address,
    and the narrative placeholder.

    Raises SkipRecord when lat/long are the literal string '0'.
    """
    if record['lat'] == '0' or record['long'] == '0':
        raise SkipRecord('Got 0 for lat/long')
    record['lat'] = float(record['lat'])
    record['long'] = float(record['long'])
    record['report_date'] = parse_date(record['reportdatetime'].split('T')[0],
                                       '%Y-%m-%d')
    record['address'] = record['blocksiteaddress'].replace(' B/O ',
                                                           ' block of ')
    if record['narrative'].upper() == 'NO NARRATIVE IS AVAILABLE.':
        # BUG FIX: the original used `==` (a no-op comparison) instead of
        # `=`, so the placeholder text was never actually normalized.
        record['narrative'] = 'Not available'
    return record
def normalize_address(self, record):
    """
    Addresses are provided with no spaces, so try to find the suffix
    if there is one, then compare the remaining part to the streets table.

    Returns a block-level address string via address_to_block(), or raises
    SkipRecord when the record has no street at all. Also updates the
    matches_found / records_seen counters for hit-rate bookkeeping.
    """
    if record['offensestreet'] is None:
        raise SkipRecord('Skipping record with no street')
    street = record['offensestreet']
    matching_suffix = ''
    # Candidate suffixes, grouped roughly longest-first so e.g. 'EXPWY'
    # is tried before 'WY'.
    suffix_groups = [('EXPWY', 'PKWY', 'FRWY', 'BLVD'),
                     ('HWY', 'FRW', 'AVE', 'CIR', 'EXT', 'BLV', 'PKW',
                      'ROW', 'WAY', 'EXP'),
                     ('DR', 'ST', 'RD', 'LN', 'BL', 'TR', 'WY', 'CT',
                      'PL', 'AV', 'CI'),
                     ('P', 'R', 'F', 'D', 'S', 'L')]
    match_found = False
    for group in suffix_groups:
        if match_found:
            break
        for suffix in group:
            if record['offensestreet'].endswith(suffix):
                street_name = record['offensestreet'][:-len(suffix)]
                # Try looking up the street name from a dictionary mapping
                # collapsed street names to names in the streets table.
                try:
                    street = self.streets[street_name]
                    matching_suffix = suffix
                    match_found = True
                    break
                except KeyError:
                    # SAINT is encoded as ST in the data, but Saint in the
                    # streets table, so try that if the address starts
                    # with ST.
                    if street_name.startswith('ST'):
                        street_name = 'SAINT%s' % street_name[2:]
                        try:
                            street = self.streets[street_name]
                            matching_suffix = suffix
                            match_found = True
                            break
                        except KeyError:
                            continue
    if match_found:
        self.matches_found += 1
    self.records_seen += 1
    normalized_block = record['offenseblock'].lstrip('0')
    # Blocks like '12xx' mean the 1200 block.
    if normalized_block[-2:] == 'xx':
        normalized_block = normalized_block.replace('xx', '00')
    address = '%s %s %s %s' % (normalized_block,
                               record['offensedirection'] or '',
                               street, matching_suffix)
    # Collapse runs of whitespace left by empty components.
    address = re.sub(r'\s+', ' ', address)
    return address_to_block(address)
def clean_list_record(self, record):
    """Build the address and boolean flags; require a real action date."""
    record['address'] = '%s %s, %s' % (record['House #'],
                                       smart_title(record['Street Name']),
                                       smart_title(record['Borough']))
    for target, column in (('is_landmark', 'Landmarked'),
                           ('is_adult_establishment', 'Adult Estab'),
                           ('is_city_owned', 'City Owned')):
        record[target] = record[column] == 'Y'
    action_date = record['Latest Action Date']
    if not isinstance(action_date, (datetime.date, datetime.datetime)):
        raise SkipRecord('Got last action date %s' % action_date)
    return record
def clean_list_record(self, record):
    """Parse the incident date/time (the time component may be absent),
    split the unit list, and skip mutual-aid calls."""
    raw_date = record['date'].strip()
    try:
        parsed = strptime(raw_date, '%m/%d/%Y %I:%M:%S %p')
        record['incident_time'] = datetime.time(*parsed[3:6])
    except ValueError:
        # Some records carry a bare date with no time component.
        parsed = strptime(raw_date, '%m/%d/%Y')
        record['incident_time'] = None
    record['incident_date'] = datetime.date(*parsed[:3])
    record['units'] = record['units'].split()
    if 'mutual aid' in record['type'].lower():
        raise SkipRecord('Skipping mutual aid: %r' % record['type'])
    return record
def clean_list_record(self, record):
    """Strip HTML and excess whitespace from every field, trim suite/floor
    clauses off the address, and skip unwanted or individual registrants."""
    for key, value in record.items():
        without_tags = strip_tags(value)
        record[key] = re.sub(r'(?s)\s\s+', ' ', without_tags).strip()
    # Remove the "Suite/Apt" or "Floor" clause from the address, if it exists.
    address = record['address'].replace(' ,', ',')
    match = re.search(r'^(.*?), (?:Suite/Apt|Floor):.*$', address)
    if match:
        address = match.group(1)
    record['address'] = clean_address(address)
    if record['dba'] in SKIPPED_DBAS:
        raise SkipRecord('Skipping %r' % record['dba'])
    # For privacy reasons, skip individuals.
    if record['structure'].upper().strip() == 'INDIVIDUAL':
        raise SkipRecord('Skipping structure=individual')
    record['city_id'] = int(record['city_id'])
    record['site_id'] = int(record['site_id'])
    return record
def clean_list_record(self, record):
    """Parse application/review dates (review may be 'n/a'), clean the
    address, and keep only configured cities."""
    record['application_date'] = parse_date(record['application_date'],
                                            '%m/%d/%Y')
    try:
        review = parse_date(record['review_date'], '%m/%d/%Y')
    except ValueError:
        review = None  # sometimes it's 'n/a'
    record['review_date'] = review
    record['address'] = strip_unit(clean_address(record['address']))
    city = record['city']
    if city not in self.city_names:
        raise SkipRecord('Skipping city %s' % city)
    record['city'] = city.title()
    return record
def parse_detail(self, page, list_record):
    """Extract inspection records (each with its violations) from a
    detail page; raises SkipRecord on ignorable inspection types."""
    records = []
    for match in self.parse_detail_re.finditer(page):
        rec = match.groupdict()
        if rec['inspection_type'] in INSPECTION_TYPES_TO_IGNORE:
            raise SkipRecord('Got ignorable inspection type %r'
                             % rec['inspection_type'])
        html = rec.pop('html')
        rec['violations'] = [
            violation.group(1)
            for violation in self.parse_violations_re.finditer(html)
        ]
        records.append(rec)
    return records
def clean_list_record(self, record):
    """Normalize an inspection record, excluding some facility types.

    The facility type is determined by the 6th and 7th digits of FAC_ID.
    """
    fac_type = record['FAC_ID'][5:7]
    if fac_type in EXCLUDED_FACILITY_TYPES:
        raise SkipRecord('Excluding record from facility type %s' % fac_type)
    record['DATE'] = parse_date(record['DATE']).date()
    record['FAC_NAME'] = record['FAC_NAME'].decode('Latin-1')
    record['FIN_SCORE'] = float(record['FIN_SCORE']) + float(record['RAW_SCORE'])
    record['schema_slug'] = self.get_schema_slug(record)
    record['facility_type'] = self.clean_facility_type(fac_type)
    record['result'] = self.get_result(record)
    return record
def clean_list_record(self, record):
    """Geocode and sanity-check a feed entry, flipping coordinates that
    appear to be in (lat, lon) order instead of (lon, lat)."""
    record.title = convert_entities(record['title'])
    record.description = convert_entities(record['description'])
    # Don't know why, but some feeds have 'id' *instead* of 'link'.
    if record.get('id', '').startswith('http'):
        record['link'] = record['id']
    # This tries GeoRSS, RDF Geo, xCal, ...
    point, location_name = self.get_point_and_location_name(record)
    short_title = record['title'][:30] + '...'
    if not point:
        raise SkipRecord("couldn't geocode any addresses in item '%s...'"
                         % short_title)
    if not location_name:
        raise SkipRecord(
            "Skip, no location name and failed to reverse geocode %s for %r"
            % (point.wkt, short_title))
    if not intersects_metro_bbox(point):
        # Check if latitude, longitude seem to be reversed; I've
        # seen that in some bad feeds!
        flipped = Point(point.y, point.x)
        if not intersects_metro_bbox(flipped):
            raise SkipRecord("Skipping %r as %s,%s is out of bounds"
                             % (short_title, point.y, point.x))
        self.logger.info(
            "Got points in apparently reverse order, flipping them")
        point = flipped
    record['location_name'] = location_name
    record['location'] = point
    return record
def save(self, old_record, list_record, detail_record):
    """Validate list_record through NewsItemForm and create or update the
    news item; raises SkipRecord (with the form errors) on validation
    failure."""
    attributes = list_record.pop('attributes', {})
    list_record.setdefault('schema', self.schema.id)
    if not old_record:
        list_record.setdefault('item_date', today())
        list_record.setdefault('pub_date', now())
    from ebpub.db.forms import NewsItemForm
    form = NewsItemForm(list_record, instance=old_record)
    if not form.is_valid():
        self.logger.info("Skipping due to validation failures:")
        for field_name, errors in form.errors.items():
            self.logger.info("%s: %s" % (field_name, errors.as_text()))
        raise SkipRecord(form.errors)
    return self.create_or_update(old_record, attributes, **form.cleaned_data)
def clean_list_record(self, record):
    """Parse the waiver date, assemble the street address from its parts,
    and expand the borough abbreviation (skipping unknown boroughs)."""
    record['waiver_date'] = parse_date(record['waiver_date'], '%m/%d/%y')
    pieces = [record.pop(part, '').strip()
              for part in ('street_number', 'street_name', 'street_suffix')]
    record['address'] = ('%s %s %s' % tuple(pieces)).strip()
    boroughs = {
        'BK': 'Brooklyn',
        'BX': 'The Bronx',
        'MN': 'Manhattan',
        'QS': 'Queens',
        'SI': 'Staten Island',
    }
    try:
        record['borough'] = boroughs[record['borough']]
    except KeyError:
        raise SkipRecord('Invalid borough')
    return record
def clean_list_record(self, record):
    """Clean a street-closure record: strip tags, parse the date range and
    posted timestamp, and build the block-range address."""
    for key, value in record.items():
        record[key] = strip_unneeded_tags(value)
    if not record['street_name']:
        raise SkipRecord('Street name not found')
    record['start_date'] = parse_date(record['start_date'], '%m/%d/%y')
    record['end_date'] = parse_date(record['end_date'], '%m/%d/%y')
    # Combine the update date and time into a single posted datetime.
    posted = '%s_%s' % (record.pop('update_date'), record.pop('update_time'))
    record['date_posted'] = parse_date(posted, '%m/%d/%Y_%I:%M %p',
                                       return_datetime=True)
    record['address'] = '%s-%s %s %s %s' % (
        record['block_from'], record['block_to'], record['street_dir'],
        record['street_name'], record['street_suffix'])
    details = re.sub(r'(?i)<br>', '\n', record['details'])
    record['details'] = details.replace(' ', ' ').strip()
    return record
def clean_list_record(self, record):
    """Turn a raw Flickr photo record into our cleaned news-item dict,
    reverse-geocoding the photo's coordinates for the location name."""
    cleaned = {}
    # Item date, in timezone of the photo owner.
    # Not sure how to determine what that is, so we'll leave it.
    taken = datetime.datetime.strptime(record['datetaken'],
                                       '%Y-%m-%d %H:%M:%S')
    cleaned['item_date'] = taken.date()
    # Posted date, UTC timestamp.
    uploaded = datetime.datetime.fromtimestamp(float(record['dateupload']),
                                               utc)
    cleaned['pub_date'] = uploaded.astimezone(local_tz)
    cleaned['description'] = convert_entities(
        record['description']['_content'].strip())
    cleaned['title'] = convert_entities(record['title'])
    lon, lat = record['longitude'], record['latitude']
    cleaned['location'] = Point((float(lon), float(lat)))
    # Possibly we could figure out flickr's geo API and resolve
    # the photo's place_id and/or woeid to the place name? But
    # those are probably not specific enough; reverse-geocode
    # instead.
    try:
        block, _distance = reverse_geocode(cleaned['location'])
    except ReverseGeocodeError:
        raise SkipRecord("Could not geocode location %s, %s" % (lon, lat))
    cleaned['location_name'] = block.pretty_name
    # Don't think any of the urls returned by the API's "extras"
    # correspond to the page? not sure.
    cleaned['url'] = 'http://www.flickr.com/photos/%(owner)s/%(id)s' % record
    cleaned['_attributes'] = {
        'sourcename': 'Flickr',
        'user_id': record['owner'],
        'username': record['ownername'],
        # Thumbnail. 'Small square' photos are 75x75.
        'photo_href': record['url_sq'],
    }
    return cleaned
def clean_list_record(self, record):
    """Clean an inspection record; skip it when no inspection date is
    available, and trim a trailing ZIP from the address."""
    if record['last_inspection_date'].lower() == 'not available':
        raise SkipRecord('No inspection available')
    record['last_inspection_date'] = parse_date(
        record['last_inspection_date'], '%m/%d/%Y')
    if record['aka']:
        record['aka'] = list_aka_re.findall(record['aka'])[0]
    else:
        record['aka'] = ''
    norm_dict_space(record, 'name', 'dba', 'address')
    record['result'] = record['result'].replace(' ', '').strip()
    record['city_id'] = int(record['city_id'])
    # Remove the trailing ZIP code from the address, if it exists.
    address = record['address']
    match = re.search(r'(.*?)\s+\d\d\d\d\d$', address)
    if match:
        address = match.group(1)
    record['address'] = clean_address(address)
    return record
def clean_list_record(self, record):
    """Clean a restaurant-inspection record, joining in location,
    inspection-type, and violation data from the related lookup tables.

    Raises SkipRecord for missing inspection dates, non-integer scores,
    and unknown location or inspection-type IDs.
    """
    record['DateOfInspection'] = parse_date(record['DateOfInspection'],
                                            '%Y-%m-%d')
    if not record['DateOfInspection']:
        raise SkipRecord('Inspection date not given')
    # Empty strings become None for these optional fields.
    record['Total Time'] = record['Total Time'].strip() or None
    record['EmployeeIDofInspector'] = record['EmployeeIDofInspector'].strip() or None
    # Calculate the score range.
    record['Score'] = record['Score'].strip() or None
    if record['Score']:
        try:
            record['Score'] = int(record['Score'])
        except ValueError:
            raise SkipRecord('Got sketchy non-integer score %r'
                             % record['Score'])
        # Bucket the numeric score into a ten-point display range.
        if record['Score'] >= 91:
            record['score_range'] = '91-100'
        elif record['Score'] >= 81:
            record['score_range'] = '81-90'
        elif record['Score'] >= 71:
            record['score_range'] = '71-80'
        elif record['Score'] >= 61:
            record['score_range'] = '61-70'
        elif record['Score'] >= 51:
            record['score_range'] = '51-60'
        elif record['Score'] >= 41:
            record['score_range'] = '41-50'
        elif record['Score'] >= 31:
            record['score_range'] = '31-40'
        elif record['Score'] >= 21:
            record['score_range'] = '21-30'
        elif record['Score'] >= 11:
            record['score_range'] = '11-20'
        else:
            record['score_range'] = '0-10'
    else:
        record['score_range'] = 'N/A'
    # Get the location data from the locations table.
    try:
        loc = self.locations()[record['LocationIDInspected']]
    except KeyError:
        raise SkipRecord('Location not found')
    # Explicitly convert to a Unicode object using the utf8 codec, because
    # this might contain funky characters.
    record['restaurant_dba'] = loc['DBA'].decode('utf8').strip()
    record['restaurant_type'] = loc['Type Description'].strip()
    record['address'] = loc['StreetAddress'].decode('utf8').strip()
    record['LocationIDInspected'] = int(record['LocationIDInspected'])
    # Get the inspection type data from the inspection types table.
    try:
        inspection_type = self.inspection_types()[record['InspectionTypeID']]
    except KeyError:
        raise SkipRecord('Inspection type not found')
    record['inspection_type'] = inspection_type['InspectionType']
    # Get the violation data from the violations table.
    try:
        vios = self.violations()[record['InspectionID']]
    except KeyError:
        # No violations recorded for this inspection.
        record['violations'] = []
    else:
        vio_types = self.violation_types()
        record['violations'] = [dict(vio,
                                     type=vio_types[vio['ViolationTypeID']])
                                for vio in vios]
    return record
def clean_list_record(self, record):
    """Clean a Boston police feed entry: extract a geocodable address from
    the post text and determine the precinct from the entry's tags."""
    if record['title'].startswith(u'Boston 24'):
        # We don't include the summary posts.
        # TODO: the 'Boston 24' tag indicates posts with aggregate
        # daily stats. Make a separate schema for the aggregates,
        # with attributes like those used in
        # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
        # Or maybe not: these are citywide, not by precinct.
        # So what would be the Location? Whole city??
        self.logger.info("boston daily crime stats, we don't know how to "
                         "handle these yet")
        raise SkipRecord
    item_date = datetime.date(*record['updated_parsed'][:3])
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    full_description = text_from_html(record['content'][0]['value'])
    location, location_name = self.get_point_and_location_name(
        record, address_text=full_description)
    if not (location or location_name):
        raise SkipRecord("No location or location_name")
    # Get the precinct from the tags.
    known_precincts = ('A1', 'A15', 'A7', 'B2', 'B3', 'C11', 'C6',
                       'D14', 'D4', 'E13', 'E18', 'E5')
    tags = [t['term'] for t in record['tags']]
    precinct = next((tag for tag in tags if tag in known_precincts), None)
    if precinct is None:
        raise SkipRecord("no precinct found in tags %r" % tags)
    # TODO: we need a LocationType for precincts, and shapes; and
    # then we could set newsitem.location_object to the Location
    # for this precinct. For now we just save it as an attribute.
    precinct_lookup = self.get_or_create_lookup('precinct', precinct,
                                                precinct)
    attributes = {'precinct': precinct_lookup.id}
    return dict(
        item_date=item_date,
        location=location,
        location_name=location_name,
        title=record['title'],
        description=record['summary'],
        url=record['link'],
        attributes=attributes,
    )
def clean_list_record(self, record):
    """Skip acknowledged-status records, then defer to the parent cleaner."""
    if record['primary_status'] == 'Acknowledged':
        raise SkipRecord('Status is "Acknowledged"')
    parent = super(ConvertedCondominiumsScraper, self)
    return parent.clean_list_record(record)