def _daterange_filter(query, params, state):
    """
    Handles filtering by start and end date.

    Params consumed: startdate, enddate — each YYYY-MM-DD or RFC 3339.
    Raises QueryError on unparseable input. Returns (query, params, state).
    """
    def _pop_and_parse(name, label):
        # Pop the param (if present) and parse it, preferring YYYY-MM-DD
        # and falling back to RFC 3339. Returns None when absent.
        value = params.get(name)
        if value is None:
            return None
        del params[name]
        try:
            try:
                return parse_date(value, '%Y-%m-%d')
            except ValueError:
                return pyrfc3339.parse(value)
        except ValueError:
            raise QueryError(
                'Invalid %s date "%s", must be YYYY-MM-DD or rfc3339'
                % (label, value))

    startdate = _pop_and_parse('startdate', 'start')
    if startdate is not None:
        query = query.filter(pub_date__gte=startdate)
    enddate = _pop_and_parse('enddate', 'end')
    if enddate is not None:
        query = query.filter(pub_date__lte=enddate)
    return query, params, state
def clean_list_record(self, record):
    """Validates coordinates, builds location_name and parses the three
    optional project date fields."""
    # Records with a zero coordinate carry no usable location.
    if record['lat'] == '0' or record['long'] == '0':
        raise SkipRecord('Got 0 for lat/long')
    record['lat'] = float(record['lat'])
    record['long'] = float(record['long'])
    record['location_name'] = '%s %s, from %s to %s' % (
        record['street'], record['quadrant'],
        record['fromintersection'], record['tointersection'])
    # Timestamps are ISO-ish ('...T...'); keep only the date part.
    for source_key, dest_key in (
            ('estimatedcompletiondate', 'estimated_completion_date'),
            ('actualcompletiondate', 'actual_completion_date'),
            ('estimatedstartdate', 'estimated_start_date')):
        raw = record[source_key]
        record[dest_key] = parse_date(raw.split('T')[0], '%Y-%m-%d') if raw else None
    record['project_name'] = record['projectname'] or ''
    record['road_type'] = record['functionalclass'] or 'N/A'
    return record
def clean_detail_record(self, record):
    """
    Normalizes each license's date fields and drops licenses that have
    no issue date.
    """
    kept = []
    for license in record['licenses']:
        # 'null' is the upstream placeholder for a missing date.
        license['issue_date'] = license['issue_date'] != 'null' and parse_date(license['issue_date'], '%m/%d/%Y') or None
        license['expiration_date'] = license['expiration_date'] != 'null' and parse_date(license['expiration_date'], '%m/%d/%Y') or None
        # If the issue date is empty, just drop it. (Bug fix: the original
        # did `del record['licenses'][i]` while iterating, which skips the
        # element after each deletion and can leave stale entries; we
        # rebuild the list instead.)
        if license['issue_date'] is not None:
            kept.append(license)
    record['licenses'] = kept
    return record
def clean_detail_record(self, record):
    # Extract the DBA name from the scraped HTML cell; the site renders a
    # placeholder message when there is no active DBA.
    if 'No Active DBA found' in record['business_name']:
        record['business_name'] = ''
    else:
        m = re.search(r'(?si)<tr><td><b>Doing Business As: </b>(.*?)</td></tr>', record['business_name'])
        if not m:
            raise ScraperBroken('Got unknown business_name value %r' % record['business_name'])
        record['business_name'] = m.group(1)
    record['address'] = record['address'].strip()
    # There can be multiple license types, so this requires further parsing
    # to create a list.
    license_types = []
    for m in license_types_re.finditer(record['license_types']):
        d = m.groupdict()
        d['status_date'] = parse_date(d['status_date'], '%d-%b-%Y')
        if not d['status_date']:
            # Skip license types that don't have a status date, because
            # a NewsItem is required to have an item_date, and we don't
            # care about licenses that don't have a change date.
            continue
        d['original_issue_date'] = parse_date(d['original_issue_date'], '%d-%b-%Y')
        d['expiration_date'] = parse_date(d['expiration_date'], '%d-%b-%Y')
        # Strip leftover bold markup from the term text.
        d['term'] = d['term'].replace('</B>', '').strip()
        license_types.append(d)
    record['license_types'] = license_types
    return record
def clean_list_record(self, record):
    """
    Cleans a raw service-request record: validates coordinates, skips
    parking-meter rows and parses the various date fields.
    """
    if record['lat'] == '0' or record['long'] == '0':
        raise SkipRecord('Got 0 for lat/long')
    if record['lat'] is None or record['long'] is None:
        raise SkipRecord('Got no value for lat/long')
    # Bug fix: the original compared record['long'] == 0 (an int) against a
    # string field, which was always False; '' is the sentinel used for
    # 'lat' in the same test.
    if (record['lat'] == '' or record['long'] == '') and record['siteaddress'] == '':
        raise SkipRecord('No value found for lat/long or address')
    record['lat'] = float(record['lat'])
    record['long'] = float(record['long'])
    if record['servicetypecode'] in ('METERS',):
        raise SkipRecord('Skipping parking meter data')
    # Timestamps look ISO-ish ('...T...'); keep only the date part.
    record['order_date'] = parse_date(record['serviceorderdate'].split('T')[0], '%Y-%m-%d')
    record['add_date'] = parse_date(record['adddate'].split('T')[0], '%Y-%m-%d')
    for source_key, dest_key in (('resolutiondate', 'resolution_date'),
                                 ('inspectiondate', 'inspection_date'),
                                 ('serviceduedate', 'service_due_date')):
        raw = record[source_key]
        record[dest_key] = parse_date(raw.split('T')[0], '%Y-%m-%d') if raw else None
    record['service_notes'] = record['servicenotes'] or ''
    record['inspection_complete'] = (record['inspectionflag'] == 'Y')
    return record
def clean_list_record(self, record):
    """Validates coordinates, derives location_name and parses the
    optional estimated/actual project dates."""
    if record['lat'] == '0' or record['long'] == '0':
        raise SkipRecord('Got 0 for lat/long')
    record['lat'] = float(record['lat'])
    record['long'] = float(record['long'])
    record['location_name'] = '%s %s, from %s to %s' % (
        record['street'], record['quadrant'],
        record['fromintersection'], record['tointersection'])
    # Each raw timestamp field maps to a cleaned date field; only the
    # date part (before 'T') is meaningful.
    field_map = (('estimatedcompletiondate', 'estimated_completion_date'),
                 ('actualcompletiondate', 'actual_completion_date'),
                 ('estimatedstartdate', 'estimated_start_date'))
    for raw_key, clean_key in field_map:
        value = record[raw_key]
        if value:
            record[clean_key] = parse_date(value.split('T')[0], '%Y-%m-%d')
        else:
            record[clean_key] = None
    record['project_name'] = record['projectname'] or ''
    record['road_type'] = record['functionalclass'] or 'N/A'
    return record
def clean_list_record(self, record):
    """Normalizes a film-permit row: date, whitespace and the Type value."""
    try:
        record['Date'] = parse_date(record['Date'], '%m/%d/%Y')  # 12/31/2007
    except ValueError:
        record['Date'] = parse_date(record['Date'], '%m/%d/%y')  # 12/31/07
    for key in ('Location', 'Notes', 'Title', 'Type'):
        value = record.get(key)
        record[key] = value.strip() if value else ''
    record['Location'] = smart_title(record['Location'])
    record['Title'] = smart_title(record['Title'])
    # This is temporary! The CSV files we get are inconsistent -- sometimes
    # they're only films and don't have a "Type" field.
    if record['Type'] == '':
        record['Type'] = 'Film'
    # Normalize inconsistent data via a small alias table.
    aliases = {'Stills': 'Still photography', 'Still': 'Still photography',
               'Fim': 'Film', 'Movie': 'Film'}
    record['Type'] = aliases.get(record['Type'], record['Type'])
    return record
def _daterange_filter(query, params, state):
    """
    Handles filtering by start and end date.
    Params consumed: startdate, enddate (YYYY-MM-DD or RFC 3339).
    """
    # Both bounds follow the same parse-then-filter shape; drive them
    # from a table of (param name, ORM lookup, label for error text).
    for param, lookup, label in (('startdate', 'pub_date__gte', 'start'),
                                 ('enddate', 'pub_date__lte', 'end')):
        raw = params.get(param)
        if raw is None:
            continue
        try:
            del params[param]
            try:
                parsed = parse_date(raw, '%Y-%m-%d')
            except ValueError:
                parsed = pyrfc3339.parse(raw)
            query = query.filter(**{lookup: parsed})
        except ValueError:
            raise QueryError('Invalid %s date "%s", must be YYYY-MM-DD or rfc3339' % (label, raw))
    return query, params, state
def clean_list_record(self, record):
    """
    Cleans a service-request record: coordinate validation, parking-meter
    filtering and date parsing.
    """
    if record['lat'] == '0' or record['long'] == '0':
        raise SkipRecord('Got 0 for lat/long')
    if record['lat'] is None or record['long'] is None:
        raise SkipRecord('Got no value for lat/long')
    # Fixed: the original tested record['long'] == 0 (int) against a string
    # field, which never matched; '' is the missing-value sentinel used for
    # 'lat' in the same expression.
    if (record['lat'] == '' or record['long'] == '') and record['siteaddress'] == '':
        raise SkipRecord('No value found for lat/long or address')
    record['lat'] = float(record['lat'])
    record['long'] = float(record['long'])
    if record['servicetypecode'] in ('METERS', ):
        raise SkipRecord('Skipping parking meter data')
    # Timestamps look like 'YYYY-MM-DDTHH:MM:SS'; keep only the date part.
    record['order_date'] = parse_date(
        record['serviceorderdate'].split('T')[0], '%Y-%m-%d')
    record['add_date'] = parse_date(record['adddate'].split('T')[0], '%Y-%m-%d')
    if record['resolutiondate']:
        record['resolution_date'] = parse_date(
            record['resolutiondate'].split('T')[0], '%Y-%m-%d')
    else:
        record['resolution_date'] = None
    if record['inspectiondate']:
        record['inspection_date'] = parse_date(
            record['inspectiondate'].split('T')[0], '%Y-%m-%d')
    else:
        record['inspection_date'] = None
    if record['serviceduedate']:
        record['service_due_date'] = parse_date(
            record['serviceduedate'].split('T')[0], '%Y-%m-%d')
    else:
        record['service_due_date'] = None
    record['service_notes'] = record['servicenotes'] or ''
    record['inspection_complete'] = (record['inspectionflag'] == 'Y')
    return record
def clean_list_record(self, record):
    """Parses the report's date range, coerces the per-crime counts to
    ints and computes their total."""
    record['start_date'] = parse_date(record['start_date'], '%m/%d/%Y')
    record['end_date'] = parse_date(record['end_date'], '%m/%d/%Y')
    crime_types = ('murder', 'rape', 'robbery', 'felony_assault',
                   'burglary', 'grand_larceny', 'grand_larceny_auto')
    total = 0
    for crime in crime_types:
        count = int(record[crime])
        record[crime] = count
        total += count
    record['total'] = total
    return record
def clean_list_record(self, record):
    """Derives cleaned street/cross-street names, normalizes the permit
    reason and parses the two permit dates."""
    def _clean(name):
        # Shared cleanup for street-name-like fields.
        return smart_title(remove_leading_zero(name))
    record['clean_street_name'] = _clean(record['streetname'])
    record['clean_cross_1'] = _clean(record['Cross Street 1'].replace(' \ ', ' / '))
    record['clean_cross_2'] = _clean(record['Cross Street 2'].replace(' \ ', ' / '))
    reason = capfirst(record['Permit Reason'].lower())
    record['Permit Reason'] = reason.replace('Cut off service', 'Cut-off service')
    for key in ('Effective Date', 'Expiration Date'):
        record[key] = parse_date(record[key], '%Y-%m-%d %H:%M:%S')
    return record
def clean_list_record(self, record):
    """Parses the From/To timestamps, filling either end of the range
    from the other when one is missing, and trims the description."""
    from_date = parse_date(record['From Date'], '%Y-%m-%d %H:%M:%S')
    to_date = parse_date(record['To Date'], '%Y-%m-%d %H:%M:%S')
    if from_date is None and isinstance(to_date, datetime.date):
        from_date = to_date
    elif to_date is None and isinstance(from_date, datetime.date):
        to_date = from_date
    record['From Date'] = from_date
    record['To Date'] = to_date
    record['Description'] = record['Description'].strip()
    return record
def clean_list_record(self, record):
    """Strips markup from every field, validates the street name and
    builds the normalized date, address and details fields."""
    for key, value in record.items():
        record[key] = strip_unneeded_tags(value)
    if not record['street_name']:
        raise SkipRecord('Street name not found')
    record['start_date'] = parse_date(record['start_date'], '%m/%d/%y')
    record['end_date'] = parse_date(record['end_date'], '%m/%d/%y')
    posted = '%s_%s' % (record.pop('update_date'), record.pop('update_time'))
    record['date_posted'] = parse_date(posted, '%m/%d/%Y_%I:%M %p',
                                       return_datetime=True)
    record['address'] = '%s-%s %s %s %s' % (
        record['block_from'], record['block_to'], record['street_dir'],
        record['street_name'], record['street_suffix'])
    # NOTE(review): both replace() arguments render as a plain space in the
    # original; confirm the first is not a non-breaking space.
    details = re.sub(r'(?i)<br>', '\n', record['details'])
    record['details'] = details.replace(' ', ' ').strip()
    return record
def clean_list_record(self, record):
    """Splits the combined type/status/transfer fields into pairs and
    parses the issue and expiry dates."""
    if record['geo_code'] != self.geo_code:
        raise SkipRecord
    record = LiquorLicenseScraper.clean_list_record(self, record)
    type_dup = record.pop('type_dup')
    record['type'], record['dup'] = type_dup.split('/')
    status = record.pop('status_changed_from_to')
    record['status_from'], record['status_to'] = status.split(' / ')
    transfer = record.pop('transfer_info_from_to')
    record['transfer_info_from'], record['transfer_info_to'] = transfer.split('/')
    for key in ('orig_iss_date', 'expir_date'):
        record[key] = parse_date(record[key], '%m/%d/%Y')
    return record
def clean_list_record(self, record):
    # Normalize the three ISO-formatted date fields.
    record['CALL_DATE'] = parse_date(record['CALL_DATE'], '%Y-%m-%d')
    record['OFFENSE_DATE'] = parse_date(record['OFFENSE_DATE'], '%Y-%m-%d')
    record['REPORT_DATE'] = parse_date(record['REPORT_DATE'], '%Y-%m-%d')
    record['COMMON_LOCATION'] = record['COMMON_LOCATION'].strip()
    record['ADDRESS_NBR'] = record['ADDRESS_NBR'].strip()
    record['ADDRESS'] = record['ADDRESS'].strip()
    # The 'NARRATIVE' field includes time and disposition data. Parse that out.
    m = re.search(
        r'^Time: (?P<TIME>\d\d?:\d\d)<br>.*?<br>Disposition: (?P<DISPOSITION>.*)$',
        record.pop('NARRATIVE'))
    record.update(m.groupdict())
    record['TIME'] = parse_time(record['TIME'], '%H:%M')
    # Set location_name. The logic is different depending on the ADDRESS_TYPE.
    address_type = record['ADDRESS_TYPE']
    if address_type == 'PREMISE ADDRESS':
        record['location_name'] = '%s block of %s' % (
            record['ADDRESS_NBR'], clean_address(record['ADDRESS']))
    elif address_type == 'INTERSECTION':
        if '/' in record['ADDRESS']:
            streets = record['ADDRESS'].split('/')
            record['location_name'] = '%s and %s' % (clean_address(
                streets[0]), clean_address(streets[1]))
        else:
            record['location_name'] = clean_address(record['ADDRESS'])
    elif address_type == 'GEO-OVERRIDE':
        record['location_name'] = clean_address(record['ADDRESS'])
    elif address_type == 'COMMON LOCATION':
        # Prefer the most specific combination of fields available.
        if record['ADDRESS_NBR'] and record['ADDRESS']:
            record['location_name'] = '%s %s' % (
                record['ADDRESS_NBR'], clean_address(record['ADDRESS']))
        elif record['ADDRESS'] and record['COMMON_LOCATION']:
            record['location_name'] = '%s (%s)' % (clean_address(
                record['ADDRESS']), clean_address(
                    record['COMMON_LOCATION']))
        elif record['COMMON_LOCATION']:
            record['location_name'] = clean_address(
                record['COMMON_LOCATION'])
        elif record['ADDRESS']:
            record['location_name'] = clean_address(record['ADDRESS'])
        else:
            record['location_name'] = 'Unknown'
    else:
        record['location_name'] = 'Unknown'
    # Map the source crime type onto our (broad, detail) category pair,
    # defaulting to Unknown for unmapped types.
    try:
        d = CUSTOM_CATEGORIES[record['ORIG_CRIMETYPE_NAME']]
    except KeyError:
        d = ('Unknown', 'Unknown')
    record['broad_category'], record['detail_category'] = d
    return record
def clean_list_record(self, record):
    """Parses dates, cleans the address and restricts to known cities."""
    record['application_date'] = parse_date(record['application_date'], '%m/%d/%Y')
    try:
        review = parse_date(record['review_date'], '%m/%d/%Y')
    except ValueError:
        review = None  # sometimes it's 'n/a'
    record['review_date'] = review
    record['address'] = strip_unit(clean_address(record['address']))
    city = record['city']
    if city not in self.city_names:
        raise SkipRecord('Skipping city %s' % city)
    record['city'] = city.title()
    return record
def clean_list_record(self, record):
    """Normalizes whitespace, splits the DBA/type field and parses dates."""
    norm_dict_space(record, 'legal_name', 'address_raw', 'license', 'dba',
                    'start_date', 'application_date')
    record['dba'], record['business_type'] = record['dba'].rsplit(' - ', 1)
    for key in ('start_date', 'application_date'):
        record[key] = parse_date(record[key], '%m/%d/%Y')
    # Remove the "Suite/Apt" or "Floor" clause from the address, if it exists.
    address = record['address_raw'].replace(' ,', ',')
    m = re.search(r'^(.*?), (?:Suite/Apt|Floor):.*$', address)
    record['address_raw'] = m.group(1) if m else address
    return record
def _parse(date):
    """
    Parse a date string in any of the formats seen in this feed.

    Tries MM/DD/YYYY, then YYYY/MM/DD, then a dash-separated date built
    directly via datetime.date. Raises BadDateException if nothing
    matches. (Rewritten from a three-deep try/except pyramid into a
    flat loop over formats.)
    """
    for fmt in ('%m/%d/%Y', '%Y/%m/%d'):
        try:
            return parse_date(date, fmt)
        except ValueError:
            continue
    try:
        return datetime.date(*map(int, date.split('-')))
    except ValueError:
        raise BadDateException("Unknown date format on %r" % date)
def clean_list_record(self, record):
    """
    Cleans a crime record: skips uninteresting UCR codes, normalizes
    the address, maps UCR codes onto category names and parses the
    offense date/time.
    """
    # Get the first 2 digits of the UCR, or None. (Rewritten from the
    # error-prone `x and y or z` idiom to conditional expressions.)
    ucr1 = record['offenseucr1']
    ucr2 = record['offenseucr2']
    ucr1_prefix = ucr1[:2] if ucr1 else None
    ucr2_prefix = ucr2[:2] if ucr2 else None
    if ucr1_prefix in SKIPPED_UCR_PREFIXES:
        raise SkipRecord('Skipping record with UCR %s' % record['offenseucr1'])
    if ucr2_prefix in SKIPPED_UCR_PREFIXES:
        raise SkipRecord('Skipping record with UCR %s' % record['offenseucr2'])
    record['address'] = self.normalizer.normalize_address(record)
    # If we can't find the code in the mapping, just use the code itself
    # as the value.
    record['category'] = CATEGORY_MAPPING.get(ucr1_prefix, ucr1_prefix)
    record['crime_type'] = UCR_MAPPING.get(record['offenseucr1'],
                                           record['offenseucr1'])
    record['secondary_category'] = CATEGORY_MAPPING.get(ucr2_prefix, ucr2_prefix)
    record['secondary_crime_type'] = UCR_MAPPING.get(record['offenseucr2'],
                                                     record['offenseucr2'])
    record['offensedate'] = parse_date(record['offensedate'], '%m/%d/%Y')
    # struct_time[3:5] is (hour, minute).
    record['offensestarttime'] = datetime.time(
        *time.strptime(record['offensestarttime'], '%H:%M:%S')[3:5])
    record['offensebeat'] = record['offensebeat'] or ''
    return record
def clean_list_record(self, list_record):
    """
    Parses the installation date and coerces rack_count to an int,
    leaving it untouched when it is missing or malformed.
    """
    list_record['installation_date'] = parse_date(
        list_record['installation_date'], '%Y-%m-%d')
    try:
        list_record['rack_count'] = int(list_record['rack_count'])
    except (TypeError, ValueError):
        # TypeError covers a None rack_count (the original behavior);
        # ValueError (added) covers non-numeric strings, which previously
        # crashed the scraper.
        pass
    return list_record
def clean_list_record(self, record):
    """Restricts to our counties and requires a parseable approval date."""
    county = record['county']
    if county.upper() not in self.county_names:
        raise SkipRecord('Skipping county %s' % county)
    approval = parse_date(record['approval_date'], self.date_format)
    if approval is None:
        raise SkipRecord('Record has no date.')
    record['approval_date'] = approval
    return record
def clean_list_record(self, record):
    """Drops records from foreign counties or with unparseable dates."""
    if record['county'].upper() not in self.county_names:
        raise SkipRecord('Skipping county %s' % record['county'])
    parsed = parse_date(record['approval_date'], self.date_format)
    record['approval_date'] = parsed
    if parsed is None:
        raise SkipRecord('Record has no date.')
    return record
def clean_list_record(self, record):
    """Parses the inspection date and title-cases the text fields."""
    record['inspection_date'] = parse_date(record['inspection_date'], '%m/%d/%Y')
    for key in ('address', 'restaurant_name', 'result'):
        record[key] = smart_title(record[key])
    return record
def clean_list_record(self, record):
    """Filters by county, then normalizes the name, address and date."""
    county = record['county'].upper().strip()
    if county not in self.counties:
        raise SkipRecord('Record not in %s.' % self.counties)
    record['dba'] = smart_title(record['dba'])
    record['address'] = clean_address(record['address'])
    record['activity_date'] = parse_date(record['activity_date'], '%m/%d/%Y')
    return record
def parse_list(self, page): page = page.replace(' ', ' ') # First, get the report date by looking for "Report as of XXXX". m = re.search(r'(?i)report as of (\w+ \d\d?, \d\d\d\d)</U>', page) if not m: raise ScraperBroken('Could not find "Report as of" in page') report_date = parse_date(m.group(1), '%B %d, %Y') # Determine the headers by looking at the <th> tags, and clean them up # to match our style for keys in the list_record dictionary (lower # case, underscores instead of spaces). headers = [h.lower() for h in re.findall('(?i)<th[^>]*>(?:<a[^>]+>)?\s*(.*?)\s*(?:</a>)?</th>', page)] headers = [h.replace('<br>', ' ') for h in headers] headers = [re.sub(r'[^a-z]', ' ', h) for h in headers] headers = [re.sub(r'\s+', '_', h.strip()) for h in headers] # Dynamically construct a regex based on the number of headers. # Note that this assumes that at most *one* of the headers has an # empty name; if more than one header has an empty name, this regex # will have multiple named groups with the same name, which will cause # an error. pattern = '(?si)<tr valign=top class=report_column>%s</tr>'% '\s*'.join(['\s*<td[^>]*>\s*(?:<center>)?\s*(?P<%s>.*?)\s*(?:</center>)?\s*</td[^>]*>\s*' % (h or 'number') for h in headers]) for record in re.finditer(pattern, page): yield dict(record.groupdict(), report_date=report_date)
def clean_detail_record(self, record):
    """
    Parses the inspection date and violation list, and builds a human
    readable note describing which violations were corrected on-site.
    """
    try:
        record['inspection_date'] = parse_date(record['inspection_date'], '%m/%d/%Y')
    except KeyError:
        # Sometimes the inspection_date is missing, for whatever reason.
        # Raise a warning in this case.
        self.logger.info('Record %r has no inspection_date. Skipping.', record)
        raise SkipRecord
    record['violations'] = [(i[0], i[1] == 'Yes') for i in
                            detail_violations_re.findall(record['violations'])]
    # Determine the notes (a textual representation of which violations,
    # if any, were corrected during the inspection).
    corrected_violations = [v[0] for v in record['violations'] if v[1]]
    if record['violations']:
        if not corrected_violations:
            # Bug fix: the singular/plural choice here depends on how many
            # violations there were; the original tested the (always empty)
            # corrected list, so the singular branch was dead.
            if len(record['violations']) == 1:
                note_bit = 'violation was not'
            else:
                note_bit = 'violations were not'
        elif len(corrected_violations) == 1:
            if len(record['violations']) == 1:
                note_bit = 'violation was'
            else:
                note_bit = '%s violation was' % corrected_violations[0]
        else:
            # Multiple corrected violations.
            note_bit = '%s violations were' % get_text_list(corrected_violations, 'and')
        notes = 'The %s corrected during the inspection.' % note_bit
    else:
        # There's no need for notes if there were no violations.
        notes = ''
    record['notes'] = notes
    return record
def clean_list_record(self, record):
    """
    Cleans a filing record: parses the filing date, renames awkward
    source headers and derives property_type from the SF/SMF/Condo flags.
    """
    strip_dict(record)
    try:
        record['filing_date'] = parse_date(str(int(record['filing_dat'])), '%m%d%y')
    except ValueError:
        record['filing_date'] = None
    if record['filing_date'] is None:
        self.logger.info('Skipping invalid filing date %r', record['filing_dat'])
        raise SkipRecord
    record['address'] = clean_address(record.pop('address'))
    record['case_number'] = record.pop('case_#')
    record['document_number'] = record.pop('document_#')
    # (Removed a redundant self-assignment of 'pin_number' that popped the
    # key and immediately re-set it to the same value.)
    try:
        record['year_of_mortgage'] = str(record.pop('year_of_mo').year)
    except AttributeError:
        record['year_of_mortgage'] = 'Unknown'
    # Normalize inconsistent headers
    for old, new in (('SF', 'sf'), ('SMF', 'smf'), ('Condo', 'condo')):
        try:
            record[new] = record.pop(old)
        except KeyError:
            pass
    if int(record['sf']):
        record['property_type'] = 'Single family'
    elif int(record['smf']):
        record['property_type'] = 'Multi-unit'
    elif int(record['condo']):
        record['property_type'] = 'Condo'
    else:
        record['property_type'] = 'Unknown'
    return record
def clean_list_record(self, record):
    """Skips foreign geo codes, splits type/dup and parses the expiry."""
    if record['geo_code'] != self.geo_code:
        raise SkipRecord
    record = LiquorLicenseScraper.clean_list_record(self, record)
    license_type, dup = record.pop('type_dup').split('/')
    record['type'] = license_type
    record['dup'] = dup
    record['expir_date'] = parse_date(record['expir_date'], '%m/%d/%Y')
    return record
def clean_list_record(self, record):
    """Collapses the 58 per-violation columns into a sparse list and
    normalizes the remaining fields."""
    # Keep only violations with a non-zero count, as (number, count) pairs.
    counts = [(str(i), int(record.pop('vio%s' % i))) for i in range(1, 59)]
    record['violations'] = [(num, count) for num, count in counts if count]
    record['inspection_date'] = parse_date(record['inspection_date'], '%m/%d/%Y')
    record['address'] = clean_address(record['address'])
    if record['city'] not in self.city_names:
        raise SkipRecord('Skipping city %s' % record['city'])
    record['city'] = record['city'].title()
    for key in ('visit_number', 'critical_violations',
                'noncritical_violations', 'total_violations',
                'inspection_number'):
        record[key] = int(record[key])
    return record
def clean_list_record(self, record):
    """
    Cleans a permit record: applies skip rules, splits permit approvals
    into a list and parses the issue date in either of two formats.
    """
    if record['WORKDESC'] == '4635':
        raise SkipRecord('WORKDESC is 4635')
    if record['JOBLOCATION'] is None:
        raise SkipRecord('Record has no location')
    if record['PERMITAPPROVALS'] is None:
        record['permit_types'] = []
    else:
        record['permit_types'] = [
            self.clean_permit_type(pa)
            for pa in record['PERMITAPPROVALS'].split(',')
        ]
    # Addresses and extra data generally seem to be separated by 2 or more
    # spaces. (NOTE(review): the split token below appears to be a single
    # space in the original source -- confirm it shouldn't be two spaces.)
    record['location'] = record['JOBLOCATION'].split(' ')[0]
    # Try each known date format; the original left `issue_date` unbound
    # (a NameError) when neither matched -- fail explicitly instead.
    for fmt in ('%d-%b-%y', '%m/%d/%Y'):
        try:
            issue_date = parse_date(record['ISSUEDATE'], fmt)
            break
        except ValueError:
            continue
    else:
        raise ValueError('Unparseable ISSUEDATE %r' % record['ISSUEDATE'])
    record['issue_date'] = issue_date
    record['WORKDESC'] = record['WORKDESC'].strip()
    record['SUBDESC'] = record['SUBDESC'].strip()
    return record
def clean_list_record(self, record): record['inspection_date'] = parse_date(record['inspection_date'], '%m/%d/%y') # The PDFs don't include any sort of unique ID for a restaurant, so we # create our own by hashing the restaurant name and address. record['restaurant_hash'] = md5.new('%s.%s' % (record['restaurant_name'].upper().encode('utf8'), record['address'].upper().encode('utf8'))).hexdigest() record['raw_address'] = record['address'].upper() record['raw_city'] = record['city'].upper() clean_violations = [] notes = [] for vio in record['violation_list']: # Split the violation into code and comment. The tricky part here # is that this text could be split over multiple lines, and # sometimes the text that begins a line *looks* like a code but # really isn't. The prime example is "215.685.7498" -- the phone # number for the health department -- which we special-case in the # following regex. m = re.search(r'^(?!215[-\.]685[-\.]74|856[-\.]912[-\.]4193)(\d[-\d\.]+(?::?\s*?\([A-Z]\))?)\s+(.*)$', vio) if m: vio_code, vio_comment = m.groups() vio_code = re.sub(r'\s\s+', ' ', vio_code) # Collapse multiple spaces into one. # Violation comments have stuff like 'foo-------bar', so clean that up. vio_comment = re.sub(r'\s*--+\s*', ' -- ', vio_comment) clean_violations.append((vio_code, vio_comment)) else: notes.append(vio) record['violation_list'] = clean_violations record['notes'] = ' '.join(notes) return record
def clean_list_record(self, record):
    """Strips trailing withdrawal/chapter annotations off cruft-prone
    fields, normalizes location and date, then applies the skip rules."""
    notes = []
    notes_pats = [
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*w\/d.*)',
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*withd.*)',
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*ch\s+227\s+sec\s*5A.*)',
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*ch\s+bus\s+.*)',
        r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*c\/l.*)',
    ]
    # Strip notes off of several cruft-prone fields.
    for field in ('name', 'business_type', 'location'):
        val = record.get(field, '').strip()
        for pat in notes_pats:
            m = re.match(pat, val, re.I | re.M)
            if m is not None:
                groups = m.groupdict()
                val = groups['value']
                notes.append(groups['notes'])
        record[field] = val.strip()
    record['notes'] = notes
    record['location'] = smart_title(record['location'].strip())
    record['date'] = parse_date(record['date'].strip(), '%Y-%m-%d')
    ignore_key = (record['name'].upper(), record['location'].upper())
    if ignore_key in BUSINESS_NAMES_TO_IGNORE:
        raise SkipRecord('Skipping %s (explicitly ignored)' % record['name'])
    if record['location'] == '':
        raise SkipRecord('Skipping %s (no location)' % record['name'])
    return record
def clean_list_record(self, record):
    # Normalize the three ISO-formatted date fields.
    record['CALL_DATE'] = parse_date(record['CALL_DATE'], '%Y-%m-%d')
    record['OFFENSE_DATE'] = parse_date(record['OFFENSE_DATE'], '%Y-%m-%d')
    record['REPORT_DATE'] = parse_date(record['REPORT_DATE'], '%Y-%m-%d')
    record['COMMON_LOCATION'] = record['COMMON_LOCATION'].strip()
    record['ADDRESS_NBR'] = record['ADDRESS_NBR'].strip()
    record['ADDRESS'] = record['ADDRESS'].strip()
    # The 'NARRATIVE' field includes time and disposition data. Parse that out.
    m = re.search(r'^Time: (?P<TIME>\d\d?:\d\d)<br>.*?<br>Disposition: (?P<DISPOSITION>.*)$', record.pop('NARRATIVE'))
    record.update(m.groupdict())
    record['TIME'] = parse_time(record['TIME'], '%H:%M')
    # Set location_name. The logic is different depending on the ADDRESS_TYPE.
    address_type = record['ADDRESS_TYPE']
    if address_type == 'PREMISE ADDRESS':
        record['location_name'] = '%s block of %s' % (record['ADDRESS_NBR'], clean_address(record['ADDRESS']))
    elif address_type == 'INTERSECTION':
        if '/' in record['ADDRESS']:
            streets = record['ADDRESS'].split('/')
            record['location_name'] = '%s and %s' % (clean_address(streets[0]), clean_address(streets[1]))
        else:
            record['location_name'] = clean_address(record['ADDRESS'])
    elif address_type == 'GEO-OVERRIDE':
        record['location_name'] = clean_address(record['ADDRESS'])
    elif address_type == 'COMMON LOCATION':
        # Use the most specific combination of fields that is populated.
        if record['ADDRESS_NBR'] and record['ADDRESS']:
            record['location_name'] = '%s %s' % (record['ADDRESS_NBR'], clean_address(record['ADDRESS']))
        elif record['ADDRESS'] and record['COMMON_LOCATION']:
            record['location_name'] = '%s (%s)' % (clean_address(record['ADDRESS']), clean_address(record['COMMON_LOCATION']))
        elif record['COMMON_LOCATION']:
            record['location_name'] = clean_address(record['COMMON_LOCATION'])
        elif record['ADDRESS']:
            record['location_name'] = clean_address(record['ADDRESS'])
        else:
            record['location_name'] = 'Unknown'
    else:
        record['location_name'] = 'Unknown'
    # Map the source crime type onto our (broad, detail) category pair,
    # defaulting to Unknown for unmapped types.
    try:
        d = CUSTOM_CATEGORIES[record['ORIG_CRIMETYPE_NAME']]
    except KeyError:
        d = ('Unknown', 'Unknown')
    record['broad_category'], record['detail_category'] = d
    return record
def clean_list_record(self, record):
    """Parses the permit date and tidies the description and address."""
    record['permit_date'] = parse_date(record['permit_date'], '%m/%d/%Y')
    description = re.sub(r'[\r\n]+', ' ', record['description']).strip()
    # Avoid database-level encoding errors
    record['description'] = description.decode('iso-8859-1')
    record['clean_address'] = smart_title(record['address'])
    return record
def clean_list_record(self, record):
    """Attaches the permit's location and parses the inspection date."""
    location = self.get_location(record['PERMIT_NUMBER'])
    if location is None:
        raise SkipRecord("No permit found for '%s'" % record['PERMIT_NUMBER'])
    record['location'] = location
    record['InspectionDate'] = parse_date(record['InspectionDate'],
                                          '%Y-%m-%dT00:00:00')
    return record
def clean_list_record(self, record):
    """Cleans street/cross-street names, the permit reason and the two
    permit dates."""
    record['clean_street_name'] = smart_title(remove_leading_zero(record['streetname']))
    for n in (1, 2):
        raw = record['Cross Street %d' % n].replace(' \ ', ' / ')
        record['clean_cross_%d' % n] = smart_title(remove_leading_zero(raw))
    record['Permit Reason'] = capfirst(record['Permit Reason'].lower()).replace(
        'Cut off service', 'Cut-off service')
    record['Effective Date'] = parse_date(record['Effective Date'],
                                          '%Y-%m-%d %H:%M:%S')
    record['Expiration Date'] = parse_date(record['Expiration Date'],
                                           '%Y-%m-%d %H:%M:%S')
    return record
def clean_list_record(self, record):
    """Strips whitespace, title-cases the location, parses the date and
    skips explicitly ignored businesses."""
    for key in ('name', 'business_type'):
        record[key] = record[key].strip()
    record['location'] = smart_title(record['location'].strip())
    record['date'] = parse_date(record['date'].strip(), '%Y-%m-%d')
    ignore_key = (record['name'].upper(), record['location'].upper())
    if ignore_key in BUSINESS_NAMES_TO_IGNORE:
        raise SkipRecord('Skipping %s' % record['name'])
    return record
def clean_list_record(self, record):
    """Parses the CO date and maps a couple of truncated USDC codes onto
    their corrected project-type labels."""
    record['CO_Date'] = parse_date(record['CO_Date'], '%m/%d/%Y')
    mapping = {
        "328 - Other Not-CO'able Non-Residential Buildings(well h":
            "328 - Other CO'able Non-Res Bldgs(jails,post office)",
        "437 - All Other CO'able Buildings / Structures":
            "437 - All Other Buildings / Structures(additions, remode",
    }
    usdc = record['USDC_Code']
    record['project_type'] = mapping.get(usdc, usdc)
    return record
def clean_list_record(self, record):
    """Normalizes name/type/location, parses the date and applies the
    business ignore list."""
    record['name'] = record['name'].strip()
    record['business_type'] = record['business_type'].strip()
    location = smart_title(record['location'].strip())
    record['location'] = location
    record['date'] = parse_date(record['date'].strip(), '%Y-%m-%d')
    if (record['name'].upper(), location.upper()) in BUSINESS_NAMES_TO_IGNORE:
        raise SkipRecord('Skipping %s' % record['name'])
    return record
def clean_detail_record(self, record):
    """Collapses whitespace in the location and splits the entry
    timestamp into separate date and time fields."""
    record['location'] = re.sub(r'\s+', ' ', record['location'])
    entry = parse_date(record['entry_date'], '%Y-%m-%d %H:%M:%S',
                       return_datetime=True)
    record['entry_date'] = entry.date()
    record['entry_time'] = entry.time()
    return record
def clean_detail_record(self, record):
    # Parse the two permit dates.
    record['expiration_date'] = parse_date(record['expiration_date'], '%m/%d/%Y')
    record['issue_date'] = parse_date(record['issue_date'], '%m/%d/%Y')
    # The PDF text is in the ISO-8859-1 encoding. Convert it here so that
    # we don't get an encoding error when we save it to the database.
    record['text'] = record['text'].decode('iso-8859-1')
    record['text'] = record['text'].replace(
        'Display This Permit While Work Is In Progress', '')
    record['text'] = record['text'].strip()
    # Remove the "ISSUED TO" section, as it contains the name of the person
    # who owns the property, and we have a policy of not displaying names.
    # Note that we include a sanity check that the "ISSUED TO" section
    # doesn't contain more than 9 newlines, as that would signify a broken
    # regular expression.
    m = issued_re.search(record['text'])
    if m and m.group(0).count('\n') < 10:
        record['text'] = issued_re.sub('', record['text'])
    # A continuation line, when present, is appended to the district name.
    if record['second_line_of_district'].strip():
        record['historic_district'] += record['second_line_of_district']
    # 'historic_district' and 'landmark' are mutually exclusive outputs;
    # whichever doesn't apply is set to 'N/A'.
    if record['district_or_landmark'] == 'HISTORIC DISTRICT':
        record['landmark'] = 'N/A'
    else:
        record['landmark'] = record['historic_district']
        record['historic_district'] = 'N/A'
    # Check for a duplicate record. Because the scraper works in
    # reverse-chronological order, we can safely raise StopScraping if we
    # reach a duplicate.
    try:
        qs = NewsItem.objects.filter(schema__id=self.schema.id)
        qs = qs.by_attribute(self.schema_fields['docket'], record['docket'])
        qs = qs.by_attribute(self.schema_fields['cofa'], record['cofa'])
        old_record = qs[0]
    except IndexError:
        # No match found -- this record is new.
        pass
    else:
        raise StopScraping('Found a duplicate record %s' % old_record.id)
    return record
def clean_list_record(self, record):
    """Splits the created timestamp into date/time and converts the
    coordinates to floats."""
    created = parse_date(record['created_time'], '%m/%d/%Y %H:%M',
                         return_datetime=True)
    record['created_date'] = created.date()
    record['created_time'] = created.time()
    for axis in ('x_coord', 'y_coord'):
        record[axis] = float(record[axis])
    return record
def clean_detail_record(self, record):
    """
    Cleans the zone list and parses the optional application/complete
    dates, defaulting missing ones to None.
    """
    zones = record['zone'] and record['zone'].split(',') or []
    record['zone'] = [self.clean_zone(z) for z in zones if z != ' ']
    for key in ('application_date', 'complete_date'):
        # dict.has_key() was removed in Python 3; `in` behaves the same
        # and works on both 2 and 3.
        if key in record:
            record[key] = parse_date(record[key], '%m/%d/%Y')
        else:
            record[key] = None
    return record
def clean_list_record(self, record):
    """Strips tags from every field, validates the street name and
    builds the date, address and details fields."""
    for k in record.keys():
        record[k] = strip_unneeded_tags(record[k])
    if not record['street_name']:
        raise SkipRecord('Street name not found')
    for key in ('start_date', 'end_date'):
        record[key] = parse_date(record[key], '%m/%d/%y')
    stamp = '%s_%s' % (record.pop('update_date'), record.pop('update_time'))
    record['date_posted'] = parse_date(stamp, '%m/%d/%Y_%I:%M %p',
                                       return_datetime=True)
    record['address'] = '%s-%s %s %s %s' % (
        record['block_from'], record['block_to'], record['street_dir'],
        record['street_name'], record['street_suffix'])
    # NOTE(review): both replace() arguments render as a plain space in the
    # original; confirm the first is not a non-breaking space.
    details = re.sub(r'(?i)<br>', '\n', record['details'])
    record['details'] = details.replace(' ', ' ').strip()
    return record