def clean_list_record(self, record):
    """Normalize a raw list record in place and return it.

    Parses the three date fields, strips the address-related fields,
    splits 'NARRATIVE' into 'TIME' and 'DISPOSITION', derives
    'location_name' from 'ADDRESS_TYPE', and sets 'broad_category' and
    'detail_category' from CUSTOM_CATEGORIES.
    """
    for date_field in ('CALL_DATE', 'OFFENSE_DATE', 'REPORT_DATE'):
        record[date_field] = parse_date(record[date_field], '%Y-%m-%d')
    for text_field in ('COMMON_LOCATION', 'ADDRESS_NBR', 'ADDRESS'):
        record[text_field] = record[text_field].strip()

    # 'NARRATIVE' embeds the time and the disposition; pull both out.
    # NOTE(review): a narrative that does not match leaves the match object
    # as None, so the groupdict() call below raises AttributeError.
    narrative_match = re.search(
        r'^Time: (?P<TIME>\d\d?:\d\d)<br>.*?<br>Disposition: (?P<DISPOSITION>.*)$',
        record.pop('NARRATIVE'))
    record.update(narrative_match.groupdict())
    record['TIME'] = parse_time(record['TIME'], '%H:%M')

    # The wording of location_name depends on the ADDRESS_TYPE.
    kind = record['ADDRESS_TYPE']
    number = record['ADDRESS_NBR']
    street = record['ADDRESS']
    landmark = record['COMMON_LOCATION']
    if kind == 'PREMISE ADDRESS':
        location = '%s block of %s' % (number, clean_address(street))
    elif kind == 'INTERSECTION':
        if '/' in street:
            parts = street.split('/')
            location = '%s and %s' % (clean_address(parts[0]),
                                      clean_address(parts[1]))
        else:
            location = clean_address(street)
    elif kind == 'GEO-OVERRIDE':
        location = clean_address(street)
    elif kind == 'COMMON LOCATION':
        if number and street:
            location = '%s %s' % (number, clean_address(street))
        elif street and landmark:
            location = '%s (%s)' % (clean_address(street),
                                    clean_address(landmark))
        elif landmark:
            location = clean_address(landmark)
        elif street:
            location = clean_address(street)
        else:
            location = 'Unknown'
    else:
        location = 'Unknown'
    record['location_name'] = location

    # Crime types without a custom mapping fall back to 'Unknown'.
    try:
        categories = CUSTOM_CATEGORIES[record['ORIG_CRIMETYPE_NAME']]
    except KeyError:
        categories = ('Unknown', 'Unknown')
    record['broad_category'], record['detail_category'] = categories
    return record
def clean_list_record(self, record):
    """Normalize a filing record in place and return it.

    Sets 'filing_date', 'address', 'case_number', 'document_number',
    'year_of_mortgage' and 'property_type' on the record.

    Raises:
        SkipRecord: if 'filing_dat' cannot be turned into a date.
    """
    strip_dict(record)
    try:
        # int() drops a leading zero (e.g. 010203 -> 10203), which would
        # make '%m%d%y' mis-split or reject the value; pad back to 6 digits.
        record['filing_date'] = parse_date(
            '%06d' % int(record['filing_dat']), '%m%d%y')
    except (ValueError, TypeError):
        # TypeError covers non-numeric types such as None.
        record['filing_date'] = None
    if record['filing_date'] is None:
        self.logger.info('Skipping invalid filing date %r',
                         record['filing_dat'])
        raise SkipRecord

    record['address'] = clean_address(record.pop('address'))
    record['case_number'] = record.pop('case_#')
    record['document_number'] = record.pop('document_#')
    # Re-storing under the same key only verifies that the key is present.
    record['pin_number'] = record.pop('pin_number')
    try:
        record['year_of_mortgage'] = str(record.pop('year_of_mo').year)
    except AttributeError:
        # The popped value has no .year attribute.
        record['year_of_mortgage'] = 'Unknown'

    # Normalize inconsistent headers
    for old, new in (('SF', 'sf'), ('SMF', 'smf'), ('Condo', 'condo')):
        try:
            record[new] = record.pop(old)
        except KeyError:
            pass

    # The first non-zero flag wins, checked in this order.
    for column, label in (('sf', 'Single family'),
                          ('smf', 'Multi-unit'),
                          ('condo', 'Condo')):
        if int(record[column]):
            record['property_type'] = label
            break
    else:
        record['property_type'] = 'Unknown'
    return record
def clean_list_record(self, record):
    """Normalize an inspection record in place and return it.

    The 58 'vioN' columns are removed and replaced by a single
    'violations' list of (number-as-string, count) pairs for the non-zero
    columns. Raises SkipRecord when the city is not in self.city_names.
    """
    # Pop every vio column first, then keep only the non-zero ones.
    all_counts = [(str(number), int(record.pop('vio%s' % number)))
                  for number in range(1, 59)]
    record['violations'] = [pair for pair in all_counts if pair[1]]

    record['inspection_date'] = parse_date(record['inspection_date'],
                                           '%m/%d/%Y')
    record['address'] = clean_address(record['address'])

    city = record['city']
    if city not in self.city_names:
        raise SkipRecord('Skipping city %s' % city)
    record['city'] = city.title()

    for count_field in ('visit_number', 'critical_violations',
                        'noncritical_violations', 'total_violations',
                        'inspection_number'):
        record[count_field] = int(record[count_field])
    return record
def save(self, old_record, list_record, detail_record):
    """Create one news item per inspection in detail_record.

    detail_record is iterated as a sequence of inspection dicts. The
    duplicate check happens here (not in self.existing_record()) because
    parse_detail emits more than one record.
    """
    for inspection in detail_record:
        try:
            same_day_items = NewsItem.objects.filter(
                schema__id=self.schema.id,
                item_date=inspection['inspection_date'])
            same_day_items.by_attribute(self.schema_fields['facility_id'],
                                        list_record['facid'])[0]
        except IndexError:
            # Nothing stored yet for this facility on this date.
            pass
        else:
            # NOTE(review): this leaves save() entirely, so any remaining
            # inspections in detail_record are not processed -- confirm
            # that is intended.
            return None

        inspection_type_lookup = self.get_or_create_lookup(
            'inspection_type', inspection['inspection_type'],
            inspection['inspection_type'], make_text_slug=False)
        violations_lookups = [
            self.get_or_create_lookup('violations', name, name,
                                      make_text_slug=False)
            for name in inspection['violations']
        ]
        attributes = {
            'name': list_record['name'],
            'inspection_type': inspection_type_lookup.id,
            'violations': ','.join(str(lookup.id)
                                   for lookup in violations_lookups),
            'facility_id': list_record['facid'],
        }
        self.create_newsitem(
            attributes,
            title=smart_title(list_record['name']),
            url=self.detail_uri % list_record['facid'],
            item_date=inspection['inspection_date'],
            location_name=clean_address(list_record['location']),
        )
def clean_list_record(self, record):
    """Normalize a record in place and return it.

    Raises SkipRecord when the upper-cased, stripped county is not in
    self.counties; otherwise parses 'activity_date', title-cases 'dba'
    and cleans 'address'.
    """
    county = record['county'].upper().strip()
    if county not in self.counties:
        raise SkipRecord('Record not in %s.' % self.counties)

    activity_date = parse_date(record['activity_date'], '%m/%d/%Y')
    record['activity_date'] = activity_date
    business_name = smart_title(record['dba'])
    record['dba'] = business_name
    cleaned_address = clean_address(record['address'])
    record['address'] = cleaned_address
    return record
def clean_list_record(self, record):
    """Clean one list record in place and hand it back.

    Steps, in order: parse CALL_DATE / OFFENSE_DATE / REPORT_DATE, strip
    the address fields, extract TIME and DISPOSITION from NARRATIVE,
    compute location_name from ADDRESS_TYPE, then look up the broad and
    detail categories in CUSTOM_CATEGORIES.
    """
    date_format = '%Y-%m-%d'
    record['CALL_DATE'] = parse_date(record['CALL_DATE'], date_format)
    record['OFFENSE_DATE'] = parse_date(record['OFFENSE_DATE'], date_format)
    record['REPORT_DATE'] = parse_date(record['REPORT_DATE'], date_format)

    landmark = record['COMMON_LOCATION'].strip()
    record['COMMON_LOCATION'] = landmark
    house_number = record['ADDRESS_NBR'].strip()
    record['ADDRESS_NBR'] = house_number
    street = record['ADDRESS'].strip()
    record['ADDRESS'] = street

    # The 'NARRATIVE' field carries both the time and the disposition.
    # NOTE(review): if the narrative does not match, re.search returns None
    # and the groupdict() call raises AttributeError.
    narrative = record.pop('NARRATIVE')
    parsed = re.search(
        r'^Time: (?P<TIME>\d\d?:\d\d)<br>.*?<br>Disposition: (?P<DISPOSITION>.*)$',
        narrative)
    record.update(parsed.groupdict())
    record['TIME'] = parse_time(record['TIME'], '%H:%M')

    # Start from 'Unknown' and override it for the address types we know.
    address_type = record['ADDRESS_TYPE']
    location_name = 'Unknown'
    if address_type == 'PREMISE ADDRESS':
        location_name = '%s block of %s' % (house_number,
                                            clean_address(street))
    elif address_type == 'INTERSECTION':
        if '/' in street:
            streets = street.split('/')
            location_name = '%s and %s' % (clean_address(streets[0]),
                                           clean_address(streets[1]))
        else:
            location_name = clean_address(street)
    elif address_type == 'GEO-OVERRIDE':
        location_name = clean_address(street)
    elif address_type == 'COMMON LOCATION':
        if house_number and street:
            location_name = '%s %s' % (house_number, clean_address(street))
        elif street and landmark:
            location_name = '%s (%s)' % (clean_address(street),
                                         clean_address(landmark))
        elif landmark:
            location_name = clean_address(landmark)
        elif street:
            location_name = clean_address(street)
    record['location_name'] = location_name

    try:
        broad, detail = CUSTOM_CATEGORIES[record['ORIG_CRIMETYPE_NAME']]
    except KeyError:
        broad, detail = ('Unknown', 'Unknown')
    record['broad_category'] = broad
    record['detail_category'] = detail
    return record
def save(self, old_record, list_record, detail_record):
    """Create a news item for list_record, or update old_record.

    The item is dated by 'approval_date' and located at the cleaned
    "street, city" string. detail_record is not used.
    """
    title = self.get_title(list_record)
    item_date = list_record['approval_date']
    raw_location = '%s, %s' % (list_record['street'].strip(),
                               list_record['city'])
    values = dict(
        title=title,
        item_date=item_date,
        location_name=clean_address(raw_location),
    )
    attributes = self.get_attributes(list_record)

    if old_record is not None:
        self.update_existing(old_record, values, attributes)
        return
    self.create_newsitem(attributes, **values)
def save(self, old_record, list_record, detail_record):
    """Persist list_record as a new news item or as an update.

    A new item is created when old_record is None; otherwise old_record
    is updated with the same values and attributes. detail_record is
    ignored.
    """
    street_and_city = "%s, %s" % (list_record["street"].strip(),
                                  list_record["city"])
    values = {}
    values["title"] = self.get_title(list_record)
    values["item_date"] = list_record["approval_date"]
    values["location_name"] = clean_address(street_and_city)

    attributes = self.get_attributes(list_record)
    is_new = old_record is None
    if is_new:
        self.create_newsitem(attributes, **values)
    else:
        self.update_existing(old_record, values, attributes)
def clean_list_record(self, record):
    """Normalize an application record in place and return it.

    Parses 'application_date' and 'review_date' (the latter becomes None
    when parsing raises ValueError), cleans 'address' and strips its unit,
    and title-cases 'city'. Raises SkipRecord for cities outside
    self.city_names.
    """
    date_format = '%m/%d/%Y'
    record['application_date'] = parse_date(record['application_date'],
                                            date_format)
    try:
        review_date = parse_date(record['review_date'], date_format)
    except ValueError:
        # sometimes it's 'n/a'
        review_date = None
    record['review_date'] = review_date

    cleaned = clean_address(record['address'])
    record['address'] = strip_unit(cleaned)

    city = record['city']
    if city not in self.city_names:
        raise SkipRecord('Skipping city %s' % city)
    record['city'] = city.title()
    return record
def save(self, old_record, list_record, detail_record):
    """Create or update the news item for one bike rack record.

    The title names 'place_name' when the record has that key and falls
    back to the cleaned address otherwise. detail_record is not used.
    """
    address = clean_address(list_record['address'])
    attributes = dict(
        place_name=list_record.get('place_name', ''),
        rack_id=list_record['rack_id'],
        rack_count=list_record['rack_count'],
    )
    near = list_record.get('place_name', address)
    values = dict(
        title='Bike rack installed near %s' % near,
        item_date=list_record['installation_date'],
        location_name=address,
        url=list_record['url'],
    )
    if old_record is not None:
        self.update_existing(old_record, values, attributes)
        return
    self.create_newsitem(attributes, **values)
def clean_list_record(self, record):
    """Normalize an event record in place and return it.

    Sets 'raw_address', 'address', 'disposition', 'event', 'item_date'
    and 'item_time' on the record.
    """
    def decode_entities(text):
        # Replace the two HTML entities with their plain-text characters.
        # The previous literals were already-decoded characters
        # ('&' -> '&', ' ' -> ' '), which made both replacements no-ops.
        return text.replace('&amp;', '&').replace('&nbsp;', ' ')

    # Save the raw address so we can use it to find duplicate records in
    # the future.
    address = smart_title(
        decode_entities(record['address'].strip())).strip()
    record['raw_address'] = address
    record['address'] = address_to_block(clean_address(address))

    # An empty disposition is reported as 'Not available'.
    record['disposition'] = (
        decode_entities(record['disposition']).strip() or 'Not available')
    record['event'] = decode_entities(record['event']).strip()

    item_date = parse_date(record['datetime'], '%m/%d/%Y %I:%M:%S %p',
                           return_datetime=True)
    record['item_date'] = item_date.date()
    record['item_time'] = item_date.time()

    # Normalize this value.
    if record['disposition'] == 'CANCCOMM':
        record['disposition'] = 'CANCELLED BY COMMUNICATIONS'
    return record
def clean_list_record(self, record):
    """Normalize an inspection list record in place and return it.

    Raises:
        SkipRecord: if 'last_inspection_date' is 'not available'
            (case-insensitive).
    """
    if record['last_inspection_date'].lower() == 'not available':
        raise SkipRecord('No inspection available')
    record['last_inspection_date'] = parse_date(
        record['last_inspection_date'], '%m/%d/%Y')

    # Take the first aka match. A value the pattern does not match is
    # treated like an empty one instead of raising IndexError.
    aka_matches = list_aka_re.findall(record['aka']) if record['aka'] else []
    record['aka'] = aka_matches[0] if aka_matches else ''

    norm_dict_space(record, 'name', 'dba', 'address')
    # Drop the HTML non-breaking-space entity. The previous literal was a
    # bare space, which removed every space inside the result text.
    record['result'] = record['result'].replace('&nbsp;', '').strip()
    record['city_id'] = int(record['city_id'])

    # Remove the trailing ZIP code from the address, if it exists.
    zip_match = re.search(r'(.*?)\s+\d\d\d\d\d$', record['address'])
    if zip_match:
        record['address'] = zip_match.group(1)
    record['address'] = clean_address(record['address'])
    return record
def clean_list_record(self, record):
    """Normalize a record in place and return it.

    Records whose City is not CHICAGO raise SkipRecord. 'Amount' becomes
    None when it reads UNKNOWN, otherwise it loses ".00", "$" and ",".
    Adds 'clean_address', 'clean_address_with_unit' and 'doc_number'.
    """
    if record["City"].strip().upper() != "CHICAGO":
        raise SkipRecord

    amount = record["Amount"]
    if amount.strip().upper() == "UNKNOWN":
        record["Amount"] = None
    else:
        for noise in (".00", "$", ","):
            amount = amount.replace(noise, "")
        record["Amount"] = amount

    for date_key in ("Executed", "Recorded"):
        record[date_key] = parse_date(record[date_key], "%m/%d/%Y")

    record["clean_address"] = clean_address(record["Address"])
    # Blank and "MANY" unit values are treated as no unit.
    raw_unit = record["Unit #"]
    unit = raw_unit if raw_unit not in ("", "MANY") else None
    unit_suffix = ", unit " + unit if unit else ""
    record["clean_address_with_unit"] = "%s%s" % (record["clean_address"],
                                                  unit_suffix)

    # The document number column has two possible header names.
    try:
        doc_number = record["Doc Number"]
    except KeyError:
        doc_number = record["Doc #"]
    record["doc_number"] = doc_number
    return record
def clean_list_record(self, record):
    """Clean one record in place; skip anything outside Chicago.

    Produces a stripped 'Amount' (or None for UNKNOWN), parsed 'Executed'
    and 'Recorded' dates, 'clean_address', 'clean_address_with_unit' and
    'doc_number'.
    """
    in_chicago = record['City'].strip().upper() == 'CHICAGO'
    if not in_chicago:
        raise SkipRecord

    if record['Amount'].strip().upper() == 'UNKNOWN':
        record['Amount'] = None
    else:
        without_cents = record['Amount'].replace('.00', '')
        without_dollar = without_cents.replace('$', '')
        record['Amount'] = without_dollar.replace(',', '')

    record['Executed'] = parse_date(record['Executed'], '%m/%d/%Y')
    record['Recorded'] = parse_date(record['Recorded'], '%m/%d/%Y')

    base_address = clean_address(record['Address'])
    record['clean_address'] = base_address

    # Empty and 'MANY' units do not get a ", unit X" suffix.
    if record['Unit #'] in ('', 'MANY') or not record['Unit #']:
        suffix = ''
    else:
        suffix = ', unit ' + record['Unit #']
    record['clean_address_with_unit'] = '%s%s' % (record['clean_address'],
                                                  suffix)

    try:
        record['doc_number'] = record['Doc Number']
    except KeyError:
        # Some inputs use the shorter header instead.
        record['doc_number'] = record['Doc #']
    return record
def clean_list_record(self, record):
    """Clean one inspection list record in place and return it.

    Raises:
        SkipRecord: when 'last_inspection_date' reads 'not available'
            in any letter case.
    """
    if record['last_inspection_date'].lower() == 'not available':
        raise SkipRecord('No inspection available')
    record['last_inspection_date'] = parse_date(
        record['last_inspection_date'], '%m/%d/%Y')

    # Use the first aka match, falling back to '' when the value is empty
    # or the pattern finds nothing (previously an IndexError).
    record['aka'] = record['aka'] or ''
    if record['aka']:
        found = list_aka_re.findall(record['aka'])
        record['aka'] = found[0] if found else ''

    norm_dict_space(record, 'name', 'dba', 'address')
    # Strip the HTML non-breaking-space entity rather than every space;
    # the earlier bare-space literal collapsed multi-word results.
    record['result'] = record['result'].replace('&nbsp;', '').strip()
    record['city_id'] = int(record['city_id'])

    # Remove the trailing ZIP code from the address, if it exists.
    address = record['address']
    trailing_zip = re.search(r'(.*?)\s+\d\d\d\d\d$', address)
    if trailing_zip:
        address = trailing_zip.group(1)
    record['address'] = clean_address(address)
    return record
def save(self, old_record, list_record, detail_record):
    """Save each inspection in detail_record as its own news item.

    Existing items are detected here rather than in
    self.existing_record(), because parse_detail emits more than one
    record per facility.
    """
    for entry in detail_record:
        try:
            candidates = NewsItem.objects.filter(
                schema__id=self.schema.id,
                item_date=entry['inspection_date'])
            candidates.by_attribute(self.schema_fields['facility_id'],
                                    list_record['facid'])[0]
            already_saved = True
        except IndexError:
            already_saved = False
        if already_saved:
            # NOTE(review): returning here also skips every later entry in
            # detail_record -- verify this is the desired behavior.
            return None

        type_name = entry['inspection_type']
        inspection_type_lookup = self.get_or_create_lookup(
            'inspection_type', type_name, type_name, make_text_slug=False)

        violation_ids = []
        for violation in entry['violations']:
            lookup = self.get_or_create_lookup(
                'violations', violation, violation, make_text_slug=False)
            violation_ids.append(lookup)

        attributes = {
            'name': list_record['name'],
            'inspection_type': inspection_type_lookup.id,
            'violations': ','.join([str(item.id) for item in violation_ids]),
            'facility_id': list_record['facid'],
        }
        self.create_newsitem(
            attributes,
            title=smart_title(list_record['name']),
            url=self.detail_uri % list_record['facid'],
            item_date=entry['inspection_date'],
            location_name=clean_address(list_record['location']))
def clean_list_record(self, record):
    """Normalize a record in place and return it.

    Every value has its tags stripped and whitespace runs collapsed. The
    address loses any "Suite/Apt" or "Floor" clause before cleaning.
    Raises SkipRecord for DBAs in SKIPPED_DBAS and for records whose
    structure is 'INDIVIDUAL'.
    """
    whitespace_run = re.compile(r'(?s)\s\s+')
    for key, raw_value in record.items():
        untagged = strip_tags(raw_value)
        record[key] = whitespace_run.sub(' ', untagged).strip()

    # Remove the "Suite/Apt" or "Floor" clause from the address, if it exists.
    address = record['address'].replace(' ,', ',')
    clause = re.search(r'^(.*?), (?:Suite/Apt|Floor):.*$', address)
    if clause:
        address = clause.group(1)
    record['address'] = clean_address(address)

    dba = record['dba']
    if dba in SKIPPED_DBAS:
        raise SkipRecord('Skipping %r' % dba)

    # For privacy reasons, skip individuals.
    if record['structure'].upper().strip() == 'INDIVIDUAL':
        raise SkipRecord('Skipping structure=individual')

    for id_field in ('city_id', 'site_id'):
        record[id_field] = int(record[id_field])
    return record
def clean_list_record(self, record):
    """Normalize a record in place and return it (Chicago only).

    Non-Chicago records raise SkipRecord. The amount is reduced to bare
    digits (None for UNKNOWN), the two dates are parsed, and the cleaned
    address, address-with-unit and document number are added.
    """
    city = record['City'].strip().upper()
    if city != 'CHICAGO':
        raise SkipRecord

    raw_amount = record['Amount']
    is_unknown = raw_amount.strip().upper() == 'UNKNOWN'
    record['Amount'] = None if is_unknown else (
        raw_amount.replace('.00', '').replace('$', '').replace(',', ''))

    date_format = '%m/%d/%Y'
    record['Executed'] = parse_date(record['Executed'], date_format)
    record['Recorded'] = parse_date(record['Recorded'], date_format)
    record['clean_address'] = clean_address(record['Address'])

    # Only a real unit value (not blank, not 'MANY') is appended.
    unit = record['Unit #']
    if unit in ('', 'MANY') or not unit:
        with_unit = '%s%s' % (record['clean_address'], '')
    else:
        with_unit = '%s%s' % (record['clean_address'], ', unit ' + unit)
    record['clean_address_with_unit'] = with_unit

    try:
        number = record['Doc Number']
    except KeyError:
        number = record['Doc #']
    record['doc_number'] = number
    return record
def clean_list_record(self, record):
    """Clean one inspection record in place and return it.

    Collapses the 'vio1'..'vio58' columns into 'violations', a list of
    (number-as-string, count) tuples for the non-zero columns, then parses
    the inspection date, cleans the address, title-cases the city and
    converts the numeric fields to int. Cities not in self.city_names
    raise SkipRecord.
    """
    # Most of the 58 columns are zero, so keep only the non-zero ones.
    nonzero = []
    for index in range(1, 59):
        count = int(record.pop('vio%s' % index))
        if not count:
            continue
        nonzero.append((str(index), count))
    record['violations'] = nonzero

    inspection_date = parse_date(record['inspection_date'], '%m/%d/%Y')
    record['inspection_date'] = inspection_date
    record['address'] = clean_address(record['address'])

    if record['city'] not in self.city_names:
        raise SkipRecord('Skipping city %s' % record['city'])
    record['city'] = record['city'].title()

    numeric_fields = (
        'visit_number',
        'critical_violations',
        'noncritical_violations',
        'total_violations',
        'inspection_number',
    )
    for field in numeric_fields:
        record[field] = int(record[field])
    return record
def clean_washington_address(add, city):
    """Clean a Washington address string.

    Replaces non-breaking spaces, removes a trailing ", <city>, WA..."
    clause, and runs the result through clean_address().

    Args:
        add: the raw address text.
        city: the city name expected in the trailing clause; matched
            literally.

    Returns:
        A 2-tuple: the cleaned address, and the same address passed
        through strip_unit() and stripped of surrounding whitespace.
    """
    add = add.replace(u'\xa0', ' ')
    # Escape the city so characters such as '.' or '(' in a name are
    # matched literally instead of being read as regex syntax.
    add = re.sub(r',\s+%s,\s+WA.*$' % re.escape(city), '', add)
    add = clean_address(add)
    return add, strip_unit(add).strip()