示例#1
0
def _daterange_filter(query, params, state):
    """
    handles filtering by start and end date
    paramters: startdate, enddate
    """
    startdate = params.get('startdate')
    if startdate is not None:
        try:
            del params['startdate']
            try:
                startdate = parse_date(startdate, '%Y-%m-%d')
            except ValueError:
                startdate = pyrfc3339.parse(startdate)
            query = query.filter(pub_date__gte=startdate)
        except ValueError:
            raise QueryError(
                'Invalid start date "%s", must be YYYY-MM-DD or rfc3339' %
                startdate)

    enddate = params.get('enddate')
    if enddate is not None:
        try:
            del params['enddate']
            try:
                enddate = parse_date(enddate, '%Y-%m-%d')
            except ValueError:
                enddate = pyrfc3339.parse(enddate)
            query = query.filter(pub_date__lte=enddate)
        except ValueError:
            raise QueryError(
                'Invalid end date "%s", must be YYYY-MM-DD or rfc3339' %
                enddate)

    return query, params, state
示例#2
0
 def clean_list_record(self, record):
     """
     Convert coordinates to floats, build ``location_name`` and parse dates.

     Raises SkipRecord when lat or long is the string '0'.  Each of the three
     raw date fields is cut at the first 'T' and parsed as %Y-%m-%d; an empty
     raw value yields None.  Returns the same ``record``.
     """
     if record['lat'] == '0' or record['long'] == '0':
         raise SkipRecord('Got 0 for lat/long')
     for coord in ('lat', 'long'):
         record[coord] = float(record[coord])
     record['location_name'] = '%s %s, from %s to %s' % tuple(
         [record[part] for part in
          ('street', 'quadrant', 'fromintersection', 'tointersection')])
     # record['order_date'] = parse_date(record['serviceorderdate'].split('T')[0], '%Y-%m-%d')
     # record['add_date'] = parse_date(record['adddate'].split('T')[0], '%Y-%m-%d')
     date_fields = (
         ('estimatedcompletiondate', 'estimated_completion_date'),
         ('actualcompletiondate', 'actual_completion_date'),
         ('estimatedstartdate', 'estimated_start_date'),
     )
     for raw_key, clean_key in date_fields:
         raw = record[raw_key]
         if not raw:
             record[clean_key] = None
             continue
         # Only the part before the 'T' is parsed.
         record[clean_key] = parse_date(raw.split('T')[0], '%Y-%m-%d')
     record['project_name'] = record['projectname'] or ''
     record['road_type'] = record['functionalclass'] or 'N/A'
     return record
示例#3
0
 def clean_detail_record(self, record):
     """
     Parse license dates and drop licenses that have no issue date.

     Every entry of ``record['licenses']`` gets its ``issue_date`` and
     ``expiration_date`` parsed with ``parse_date`` (format %m/%d/%Y); the
     literal string 'null' becomes None.  Entries whose issue date ends up
     None are removed from the list.  Returns the same ``record``.
     """
     kept = []
     for lic in record['licenses']:
         for field in ('issue_date', 'expiration_date'):
             raw = lic[field]
             lic[field] = (parse_date(raw, '%m/%d/%Y') or None) if raw != 'null' else None
         if lic['issue_date'] is not None:
             kept.append(lic)
     # Rebuild the list in place rather than deleting while enumerating it:
     # deleting index i mid-iteration makes the loop skip the following
     # entry, leaving it unparsed and possibly undeleted.
     record['licenses'][:] = kept
     return record
示例#4
0
    def clean_detail_record(self, record):
        """
        Extract the DBA name, strip the address and parse the license types.

        ``business_name`` becomes '' when it contains 'No Active DBA found',
        otherwise the text captured after "Doing Business As:"; ScraperBroken
        is raised if that markup is not found.  ``license_types`` is replaced
        by a list of dicts built from ``license_types_re`` matches; matches
        whose parsed ``status_date`` is falsy are left out.
        """
        raw_name = record['business_name']
        if 'No Active DBA found' not in raw_name:
            dba_match = re.search(r'(?si)<tr><td><b>Doing Business As: </b>(.*?)</td></tr>', raw_name)
            if not dba_match:
                raise ScraperBroken('Got unknown business_name value %r' % raw_name)
            record['business_name'] = dba_match.group(1)
        else:
            record['business_name'] = ''
        record['address'] = record['address'].strip()

        # A record can carry several license types, so the raw value is
        # parsed into a list of dictionaries.
        parsed_types = []
        for type_match in license_types_re.finditer(record['license_types']):
            info = type_match.groupdict()
            info['status_date'] = parse_date(info['status_date'], '%d-%b-%Y')
            if info['status_date']:
                # Only license types with a status date are kept: a NewsItem
                # is required to have an item_date, and licenses without a
                # change date are of no interest.
                info['original_issue_date'] = parse_date(info['original_issue_date'], '%d-%b-%Y')
                info['expiration_date'] = parse_date(info['expiration_date'], '%d-%b-%Y')
                info['term'] = info['term'].replace('</B>', '').strip()
                parsed_types.append(info)
        record['license_types'] = parsed_types

        return record
示例#5
0
 def clean_list_record(self, record):
     """
     Validate coordinates, skip unwanted records and parse the date fields.

     Raises SkipRecord when lat/long is '0' or None, when
     ``(lat == '' or long == 0)`` and ``siteaddress`` is '', and when
     ``servicetypecode`` is 'METERS'.  Returns the same ``record`` with float
     lat/long, parsed dates, ``service_notes`` and ``inspection_complete``.
     """
     if record['lat'] == '0' or record['long'] == '0':
         raise SkipRecord('Got 0 for lat/long')
     if record['lat'] is None or record['long'] is None:
         raise SkipRecord('Got no value for lat/long')
     # NOTE(review): long is compared with the integer 0 here while the first
     # check compares with the string '0' -- confirm this is intended.
     coords_missing = record['lat'] == '' or record['long'] == 0
     if coords_missing and record['siteaddress'] == '':
         raise SkipRecord('No value found for lat/long or address')
     for coord in ('lat', 'long'):
         record[coord] = float(record[coord])
     if record['servicetypecode'] in ('METERS',):
         raise SkipRecord('Skipping parking meter data')

     # These two dates are always parsed (the part before the 'T' only).
     for raw_key, clean_key in (('serviceorderdate', 'order_date'),
                                ('adddate', 'add_date')):
         record[clean_key] = parse_date(record[raw_key].split('T')[0], '%Y-%m-%d')

     # These may be empty, in which case the cleaned value is None.
     for raw_key, clean_key in (('resolutiondate', 'resolution_date'),
                                ('inspectiondate', 'inspection_date'),
                                ('serviceduedate', 'service_due_date')):
         raw = record[raw_key]
         record[clean_key] = parse_date(raw.split('T')[0], '%Y-%m-%d') if raw else None

     record['service_notes'] = record['servicenotes'] or ''
     record['inspection_complete'] = (record['inspectionflag'] == 'Y')
     return record
 def clean_list_record(self, record):
     """
     Convert coordinates to floats, build ``location_name`` and parse dates.

     Raises SkipRecord when lat or long is the string "0".  Each raw date
     field is cut at the first "T" and parsed as %Y-%m-%d; an empty raw value
     yields None.  Returns the same ``record``.
     """
     if record["lat"] == "0" or record["long"] == "0":
         raise SkipRecord("Got 0 for lat/long")
     record["lat"], record["long"] = float(record["lat"]), float(record["long"])
     name_parts = (
         record["street"],
         record["quadrant"],
         record["fromintersection"],
         record["tointersection"],
     )
     record["location_name"] = "%s %s, from %s to %s" % name_parts
     # record['order_date'] = parse_date(record['serviceorderdate'].split('T')[0], '%Y-%m-%d')
     # record['add_date'] = parse_date(record['adddate'].split('T')[0], '%Y-%m-%d')
     for source_key, target_key in (
         ("estimatedcompletiondate", "estimated_completion_date"),
         ("actualcompletiondate", "actual_completion_date"),
         ("estimatedstartdate", "estimated_start_date"),
     ):
         value = record[source_key]
         # Only the part before the "T" is parsed.
         record[target_key] = parse_date(value.split("T")[0], "%Y-%m-%d") if value else None
     record["project_name"] = record["projectname"] or ""
     record["road_type"] = record["functionalclass"] or "N/A"
     return record
示例#7
0
    def clean_list_record(self, record):
        """
        Parse ``Date``, tidy the text fields and normalize ``Type``.

        ``Date`` is parsed as %m/%d/%Y, falling back to %m/%d/%y on
        ValueError.  'Location', 'Notes', 'Title' and 'Type' are stripped
        (missing or empty values become ''); 'Location' and 'Title' are then
        passed through ``smart_title``.  Returns the same ``record``.
        """
        raw_date = record['Date']
        try:
            record['Date'] = parse_date(raw_date, '%m/%d/%Y') # 12/31/2007
        except ValueError:
            record['Date'] = parse_date(raw_date, '%m/%d/%y') # 12/31/07

        for key in ('Location', 'Notes', 'Title', 'Type'):
            record[key] = record[key].strip() if key in record and record[key] else ''

        for key in ('Location', 'Title'):
            record[key] = smart_title(record[key])

        kind = record['Type']
        # This is temporary! The CSV files we get are inconsistent -- sometimes
        # they're only films and don't have a "Type" field.
        if kind == '':
            kind = 'Film'

        # Normalize inconsistent data.
        if kind in ('Stills', 'Still'):
            kind = 'Still photography'
        if kind in ('Fim', 'Movie'):
            kind = 'Film'
        record['Type'] = kind

        return record
示例#8
0
 def clean_detail_record(self, record):
     """
     Parse license dates and drop licenses that have no issue date.

     Every entry of ``record['licenses']`` gets its ``issue_date`` and
     ``expiration_date`` parsed with ``parse_date`` (format %m/%d/%Y); the
     literal string 'null' becomes None.  Entries whose issue date ends up
     None are removed from the list.  Returns the same ``record``.
     """
     def _parse_or_none(raw):
         # 'null' marks a missing date; a falsy parse result also maps to None.
         if raw == 'null':
             return None
         return parse_date(raw, '%m/%d/%Y') or None

     for entry in record['licenses']:
         entry['issue_date'] = _parse_or_none(entry['issue_date'])
         entry['expiration_date'] = _parse_or_none(entry['expiration_date'])
     # Filter after the loop and assign in place: deleting items while
     # enumerating the same list skips the entry that follows each deletion.
     record['licenses'][:] = [
         entry for entry in record['licenses'] if entry['issue_date'] is not None
     ]
     return record
def _daterange_filter(query, params, state):
    """
    handles filtering by start and end date
    paramters: startdate, enddate
    """
    startdate = params.get('startdate')
    if startdate is not None:
        try:
            del params['startdate']
            try:
                startdate = parse_date(startdate, '%Y-%m-%d')
            except ValueError:
                startdate = pyrfc3339.parse(startdate)
            query = query.filter(pub_date__gte=startdate)
        except ValueError:
            raise QueryError('Invalid start date "%s", must be YYYY-MM-DD or rfc3339' % startdate)
    
    enddate = params.get('enddate')
    if enddate is not None:
        try:
            del params['enddate']
            try:
                enddate = parse_date(enddate, '%Y-%m-%d')
            except ValueError: 
                enddate = pyrfc3339.parse(enddate)
            query = query.filter(pub_date__lte=enddate)
        except ValueError:
            raise QueryError('Invalid end date "%s", must be YYYY-MM-DD or rfc3339' % enddate)

    return query, params, state
示例#10
0
 def clean_list_record(self, record):
     """
     Validate coordinates, skip unwanted records and parse the date fields.

     Raises SkipRecord when lat/long is '0' or None, when
     ``(lat == '' or long == 0)`` and ``siteaddress`` is '', and when
     ``servicetypecode`` is 'METERS'.  Returns the same ``record`` with float
     lat/long, parsed dates, ``service_notes`` and ``inspection_complete``.
     """
     def _day(raw):
         # Parse only the part before the 'T'.
         return parse_date(raw.split('T')[0], '%Y-%m-%d')

     if record['lat'] == '0' or record['long'] == '0':
         raise SkipRecord('Got 0 for lat/long')
     if record['lat'] is None or record['long'] is None:
         raise SkipRecord('Got no value for lat/long')
     # NOTE(review): long is compared with the integer 0 here while the first
     # check compares with the string '0' -- confirm this is intended.
     if record['lat'] == '' or record['long'] == 0:
         if record['siteaddress'] == '':
             raise SkipRecord('No value found for lat/long or address')
     record['lat'] = float(record['lat'])
     record['long'] = float(record['long'])
     if record['servicetypecode'] in ('METERS', ):
         raise SkipRecord('Skipping parking meter data')
     record['order_date'] = _day(record['serviceorderdate'])
     record['add_date'] = _day(record['adddate'])
     optional_dates = (
         ('resolutiondate', 'resolution_date'),
         ('inspectiondate', 'inspection_date'),
         ('serviceduedate', 'service_due_date'),
     )
     for raw_key, clean_key in optional_dates:
         if record[raw_key]:
             record[clean_key] = _day(record[raw_key])
         else:
             record[clean_key] = None
     record['service_notes'] = record['servicenotes'] or ''
     record['inspection_complete'] = (record['inspectionflag'] == 'Y')
     return record
示例#11
0
    def clean_list_record(self, record):
        """
        Parse ``Date``, tidy the text fields and normalize ``Type``.

        ``Date`` is parsed as %m/%d/%Y with a %m/%d/%y fallback on
        ValueError.  'Location', 'Notes', 'Title' and 'Type' are stripped or
        set to '' when missing/empty; 'Location' and 'Title' then go through
        ``smart_title``.  Returns the same ``record``.
        """
        date_text = record['Date']
        try:
            parsed = parse_date(date_text, '%m/%d/%Y')  # 12/31/2007
        except ValueError:
            parsed = parse_date(date_text, '%m/%d/%y')  # 12/31/07
        record['Date'] = parsed

        for key in ('Location', 'Notes', 'Title', 'Type'):
            has_value = key in record and record[key]
            record[key] = record[key].strip() if has_value else ''

        record['Location'] = smart_title(record['Location'])
        record['Title'] = smart_title(record['Title'])

        # This is temporary! The CSV files we get are inconsistent -- sometimes
        # they're only films and don't have a "Type" field.
        if not record['Type'] != '':
            record['Type'] = 'Film'

        # Normalize inconsistent data.
        for variants, canonical in ((('Stills', 'Still'), 'Still photography'),
                                    (('Fim', 'Movie'), 'Film')):
            if record['Type'] in variants:
                record['Type'] = canonical

        return record
示例#12
0
 def clean_list_record(self, record):
     """
     Parse the start/end dates and convert the crime counts to integers.

     Adds ``total``, the sum of the seven converted counts.  Returns the same
     ``record``.
     """
     for date_key in ('start_date', 'end_date'):
         record[date_key] = parse_date(record[date_key], '%m/%d/%Y')
     crime_types = ('murder', 'rape', 'robbery', 'felony_assault', 'burglary', 'grand_larceny', 'grand_larceny_auto')
     running_total = 0
     for key in crime_types:
         count = int(record[key])
         record[key] = count
         running_total += count
     record['total'] = running_total
     return record
示例#13
0
 def clean_list_record(self, record):
     """
     Add cleaned street names and normalize the permit reason and dates.

     ``clean_street_name``, ``clean_cross_1`` and ``clean_cross_2`` are built
     with ``remove_leading_zero`` and ``smart_title``; in the cross streets a
     backslash surrounded by spaces is replaced by ' / '.  'Effective Date'
     and 'Expiration Date' are parsed as %Y-%m-%d %H:%M:%S.  Returns the same
     ``record``.
     """
     # The backslash is escaped explicitly: '\ ' is an invalid escape
     # sequence that newer Python versions warn about.  The value is
     # unchanged (space, backslash, space).
     separator = ' \\ '
     record['clean_street_name'] = smart_title(remove_leading_zero(record['streetname']))
     record['clean_cross_1'] = smart_title(remove_leading_zero(record['Cross Street 1'].replace(separator, ' / ')))
     record['clean_cross_2'] = smart_title(remove_leading_zero(record['Cross Street 2'].replace(separator, ' / ')))
     record['Permit Reason'] = capfirst(record['Permit Reason'].lower()).replace('Cut off service', 'Cut-off service')
     record['Effective Date'] = parse_date(record['Effective Date'], '%Y-%m-%d %H:%M:%S')
     record['Expiration Date'] = parse_date(record['Expiration Date'], '%Y-%m-%d %H:%M:%S')
     return record
示例#14
0
 def clean_list_record(self, record):
     """
     Parse the start/end dates, make the crime counts integers and add their
     sum as ``total``.  Returns the same ``record``.
     """
     start = parse_date(record['start_date'], '%m/%d/%Y')
     record['start_date'] = start
     end = parse_date(record['end_date'], '%m/%d/%Y')
     record['end_date'] = end
     crime_types = ('murder', 'rape', 'robbery', 'felony_assault', 'burglary', 'grand_larceny', 'grand_larceny_auto')
     for crime in crime_types:
         record[crime] = int(record[crime])
     record['total'] = sum(record[crime] for crime in crime_types)
     return record
示例#15
0
 def clean_list_record(self, record):
     """
     Parse 'From Date' and 'To Date' and strip 'Description'.

     When one of the two parsed dates is None and the other is a
     ``datetime.date``, the missing one is set to the other.  Returns the
     same ``record``.
     """
     begins = parse_date(record['From Date'], '%Y-%m-%d %H:%M:%S')
     record['From Date'] = begins
     ends = parse_date(record['To Date'], '%Y-%m-%d %H:%M:%S')
     record['To Date'] = ends
     if begins is None and isinstance(ends, datetime.date):
         record['From Date'] = ends
     elif ends is None and isinstance(begins, datetime.date):
         record['To Date'] = begins
     record['Description'] = record['Description'].strip()
     return record
示例#16
0
 def clean_list_record(self, record):
     """
     Strip tags from every value, parse the dates and build the address.

     Raises SkipRecord when ``street_name`` is empty after tag stripping.
     ``update_date`` and ``update_time`` are removed from the record and
     combined into ``date_posted``.  Returns the same ``record``.
     """
     for key, value in record.items():
         record[key] = strip_unneeded_tags(value)
     if not record['street_name']:
         raise SkipRecord('Street name not found')
     for date_key in ('start_date', 'end_date'):
         record[date_key] = parse_date(record[date_key], '%m/%d/%y')
     posted_day = record.pop('update_date')
     posted_time = record.pop('update_time')
     record['date_posted'] = parse_date('%s_%s' % (posted_day, posted_time), '%m/%d/%Y_%I:%M %p', return_datetime=True)
     address_parts = (record['block_from'], record['block_to'], record['street_dir'], record['street_name'], record['street_suffix'])
     record['address'] = '%s-%s %s %s %s' % address_parts
     # <br> tags become newlines and &nbsp; entities become plain spaces.
     details = re.sub(r'(?i)<br>', '\n', record['details'])
     record['details'] = details.replace('&nbsp;', ' ').strip()
     return record
示例#17
0
 def clean_list_record(self, record):
     """
     Skip records for other geo codes, then split the combined fields.

     Raises SkipRecord when ``record['geo_code']`` differs from
     ``self.geo_code``.  After ``LiquorLicenseScraper.clean_list_record`` has
     run, the combined ``type_dup``, ``status_changed_from_to`` and
     ``transfer_info_from_to`` fields are removed and split into two fields
     each, and the two date fields are parsed as %m/%d/%Y.
     """
     if record['geo_code'] != self.geo_code:
         raise SkipRecord
     record = LiquorLicenseScraper.clean_list_record(self, record)
     license_type, dup = record.pop('type_dup').split('/')
     record['type'], record['dup'] = license_type, dup
     status_from, status_to = record.pop('status_changed_from_to').split(' / ')
     record['status_from'], record['status_to'] = status_from, status_to
     transfer_from, transfer_to = record.pop('transfer_info_from_to').split('/')
     record['transfer_info_from'], record['transfer_info_to'] = transfer_from, transfer_to
     for date_key in ('orig_iss_date', 'expir_date'):
         record[date_key] = parse_date(record[date_key], '%m/%d/%Y')
     return record
示例#18
0
    def clean_list_record(self, record):
        """
        Parse dates and the narrative, then derive location and categories.

        The three date fields are parsed as %Y-%m-%d and the address fields
        stripped.  'NARRATIVE' is removed from the record and its TIME and
        DISPOSITION parts are merged in; TIME is parsed as %H:%M.
        ``location_name`` depends on ADDRESS_TYPE, and ``broad_category`` /
        ``detail_category`` come from CUSTOM_CATEGORIES, defaulting to
        'Unknown'.  Returns the same ``record``.
        """
        for date_key in ('CALL_DATE', 'OFFENSE_DATE', 'REPORT_DATE'):
            record[date_key] = parse_date(record[date_key], '%Y-%m-%d')
        for text_key in ('COMMON_LOCATION', 'ADDRESS_NBR', 'ADDRESS'):
            record[text_key] = record[text_key].strip()

        # The 'NARRATIVE' field includes time and disposition data. Parse that out.
        narrative = record.pop('NARRATIVE')
        narrative_match = re.search(
            r'^Time: (?P<TIME>\d\d?:\d\d)<br>.*?<br>Disposition: (?P<DISPOSITION>.*)$',
            narrative)
        record.update(narrative_match.groupdict())

        record['TIME'] = parse_time(record['TIME'], '%H:%M')

        # Set location_name. The logic is different depending on the ADDRESS_TYPE.
        address_type = record['ADDRESS_TYPE']
        number = record['ADDRESS_NBR']
        street = record['ADDRESS']
        landmark = record['COMMON_LOCATION']
        location = 'Unknown'
        if address_type == 'PREMISE ADDRESS':
            location = '%s block of %s' % (number, clean_address(street))
        elif address_type == 'INTERSECTION':
            if '/' in street:
                streets = street.split('/')
                location = '%s and %s' % (clean_address(streets[0]),
                                          clean_address(streets[1]))
            else:
                location = clean_address(street)
        elif address_type == 'GEO-OVERRIDE':
            location = clean_address(street)
        elif address_type == 'COMMON LOCATION':
            # Use the most specific combination of fields that is available.
            if number and street:
                location = '%s %s' % (number, clean_address(street))
            elif street and landmark:
                location = '%s (%s)' % (clean_address(street),
                                        clean_address(landmark))
            elif landmark:
                location = clean_address(landmark)
            elif street:
                location = clean_address(street)
        record['location_name'] = location

        try:
            categories = CUSTOM_CATEGORIES[record['ORIG_CRIMETYPE_NAME']]
        except KeyError:
            categories = ('Unknown', 'Unknown')
        record['broad_category'], record['detail_category'] = categories

        return record
示例#19
0
 def clean_list_record(self, record):
     """
     Parse 'From Date' / 'To Date' and strip 'Description'.

     If exactly one of the parsed dates is None while the other is a
     ``datetime.date``, the None side takes the other's value.  Returns the
     same ``record``.
     """
     stamp_format = '%Y-%m-%d %H:%M:%S'
     record['From Date'] = parse_date(record['From Date'], stamp_format)
     record['To Date'] = parse_date(record['To Date'], stamp_format)
     first, last = record['From Date'], record['To Date']
     if first is None and isinstance(last, datetime.date):
         record['From Date'] = last
     elif last is None and isinstance(first, datetime.date):
         record['To Date'] = first
     record['Description'] = record['Description'].strip()
     return record
示例#20
0
    def clean_list_record(self, record):
        """
        Parse the application/review dates and clean the address and city.

        ``review_date`` becomes None when parsing raises ValueError.  Raises
        SkipRecord when the city is not in ``self.city_names``.  Returns the
        same ``record``.
        """
        record['application_date'] = parse_date(record['application_date'], '%m/%d/%Y')

        try:
            reviewed = parse_date(record['review_date'], '%m/%d/%Y')
        except ValueError: # sometimes it's 'n/a'
            reviewed = None
        record['review_date'] = reviewed

        record['address'] = strip_unit(clean_address(record['address']))
        city = record['city']
        if city not in self.city_names:
            raise SkipRecord('Skipping city %s' % city)
        record['city'] = city.title()
        return record
示例#21
0
    def clean_list_record(self, record):
        """
        Normalize dates, address and city of a record.

        ``application_date`` must parse as %m/%d/%Y; ``review_date`` falls
        back to None on ValueError.  Records whose city is not listed in
        ``self.city_names`` raise SkipRecord.  Returns the same ``record``.
        """
        date_format = '%m/%d/%Y'
        record['application_date'] = parse_date(record['application_date'], date_format)

        try:
            record['review_date'] = parse_date(record['review_date'], date_format)
        except ValueError: # sometimes it's 'n/a'
            record['review_date'] = None

        cleaned_address = clean_address(record['address'])
        record['address'] = strip_unit(cleaned_address)
        if not record['city'] in self.city_names:
            raise SkipRecord('Skipping city %s' % record['city'])
        record['city'] = record['city'].title()
        return record
示例#22
0
    def clean_list_record(self, record):
        """
        Split the DBA field, parse the dates and trim the raw address.

        ``dba`` is split at its last ' - ' into ``dba`` and
        ``business_type``.  A trailing "Suite/Apt:" or "Floor:" clause is
        dropped from ``address_raw``.  Returns the same ``record``.
        """
        norm_dict_space(record, 'legal_name', 'address_raw', 'license', 'dba', 'start_date', 'application_date')
        record['dba'], record['business_type'] = record['dba'].rsplit(' - ', 1)
        for date_key in ('start_date', 'application_date'):
            record[date_key] = parse_date(record[date_key], '%m/%d/%Y')

        # Remove the "Suite/Apt" or "Floor" clause from the address, if it exists.
        address = record['address_raw'].replace(' ,', ',')
        unit_match = re.search(r'^(.*?), (?:Suite/Apt|Floor):.*$', address)
        record['address_raw'] = unit_match.group(1) if unit_match else address

        return record
示例#23
0
 def _parse(date):
     """
     Parse ``date`` using the first format that fits.

     Tries ``parse_date`` with %m/%d/%Y and then %Y/%m/%d, and finally treats
     the value as dash-separated integers passed to ``datetime.date``.
     Raises BadDateException when none of these works.
     """
     # Ugh, papering over wild proliferation of date formats.
     for fmt in ('%m/%d/%Y', '%Y/%m/%d'):
         try:
             return parse_date(date, fmt)
         except ValueError:
             continue
     try:
         return datetime.date(*map(int, date.split('-')))
     except (ValueError, TypeError):
         # TypeError covers a wrong number of dash-separated parts (for
         # example '2008-01'), which otherwise escapes as a bare TypeError
         # instead of the documented BadDateException.
         raise BadDateException("Unknown date format on %r" % date)
示例#24
0
 def _parse(date):
     """
     Return ``date`` parsed with the first matching format.

     ``parse_date`` is tried with %m/%d/%Y and %Y/%m/%d; as a last resort
     the value is split on '-' and the integer parts are passed to
     ``datetime.date``.  Raises BadDateException if every attempt fails.
     """
     # Ugh, papering over wild proliferation of date formats.
     for candidate in ('%m/%d/%Y', '%Y/%m/%d'):
         try:
             return parse_date(date, candidate)
         except ValueError:
             # Not this format; try the next one.
             pass
     try:
         parts = [int(piece) for piece in date.split('-')]
         return datetime.date(*parts)
     except (ValueError, TypeError):
         # datetime.date raises TypeError when the number of parts is not
         # three; report that as a bad date too rather than letting the
         # TypeError escape.
         raise BadDateException("Unknown date format on %r" %
                                date)
示例#25
0
    def clean_list_record(self, record):
        """
        Skip unwanted UCR codes, then add address, categories and date/time.

        Raises SkipRecord when the first two characters of either UCR code
        are in SKIPPED_UCR_PREFIXES.  Category and crime type fall back to
        the prefix / code itself when the mapping has no entry.
        ``offensestarttime`` keeps only hour and minute.  Returns the same
        ``record``.
        """
        # Get the first 2 digits of the UCR, or None.
        ucr1 = record['offenseucr1']
        ucr1_prefix = (ucr1[:2] or None) if ucr1 else None
        ucr2 = record['offenseucr2']
        ucr2_prefix = (ucr2[:2] or None) if ucr2 else None

        for prefix, ucr_key in ((ucr1_prefix, 'offenseucr1'),
                                (ucr2_prefix, 'offenseucr2')):
            if prefix in SKIPPED_UCR_PREFIXES:
                raise SkipRecord('Skipping record with UCR %s' %
                                 record[ucr_key])

        record['address'] = self.normalizer.normalize_address(record)

        # If we can't find the code in the mapping, just use the code itself
        # as the value.
        record['category'] = CATEGORY_MAPPING.get(ucr1_prefix, ucr1_prefix)
        primary_code = record['offenseucr1']
        record['crime_type'] = UCR_MAPPING.get(primary_code, primary_code)
        record['secondary_category'] = CATEGORY_MAPPING.get(
            ucr2_prefix, ucr2_prefix)
        secondary_code = record['offenseucr2']
        record['secondary_crime_type'] = UCR_MAPPING.get(
            secondary_code, secondary_code)

        record['offensedate'] = parse_date(record['offensedate'], '%m/%d/%Y')
        # Items 3 and 4 of the parsed struct are hour and minute.
        hour, minute = time.strptime(record['offensestarttime'], '%H:%M:%S')[3:5]
        record['offensestarttime'] = datetime.time(hour, minute)
        record['offensebeat'] = record['offensebeat'] or ''
        return record
示例#26
0
 def clean_list_record(self, list_record):
     """
     Parse ``installation_date`` and convert ``rack_count`` to an int.

     ``rack_count`` is left unchanged when ``int()`` raises TypeError.
     Returns the same ``list_record``.
     """
     installed = parse_date(list_record['installation_date'], '%Y-%m-%d')
     list_record['installation_date'] = installed
     try:
         racks = int(list_record['rack_count'])
     except TypeError:
         # Keep the original value when it cannot be converted.
         return list_record
     list_record['rack_count'] = racks
     return list_record
示例#27
0
 def clean_list_record(self, record):
     """
     Skip records for other counties or without a date.

     Raises SkipRecord when the upper-cased county is not in
     ``self.county_names`` or when ``approval_date`` parses to None.
     Returns the same ``record`` with the parsed ``approval_date``.
     """
     county = record['county']
     if county.upper() not in self.county_names:
         raise SkipRecord('Skipping county %s' % county)
     approved = parse_date(record['approval_date'], self.date_format)
     record['approval_date'] = approved
     if approved is None:
         raise SkipRecord('Record has no date.')
     return record
示例#28
0
 def clean_list_record(self, list_record):
     """
     Parse ``installation_date`` (%Y-%m-%d) and make ``rack_count`` an int.

     A TypeError from ``int()`` leaves ``rack_count`` untouched.  Returns the
     same ``list_record``.
     """
     list_record['installation_date'] = parse_date(list_record['installation_date'], '%Y-%m-%d')
     try:
         converted = int(list_record['rack_count'])
     except TypeError:
         pass
     else:
         list_record['rack_count'] = converted
     return list_record
示例#29
0
 def clean_list_record(self, record):
     """
     Keep only records for known counties that have an approval date.

     Raises SkipRecord when the upper-cased county is not in
     ``self.county_names`` or when the parsed ``approval_date`` is None.
     Returns the same ``record``.
     """
     in_known_county = record["county"].upper() in self.county_names
     if not in_known_county:
         raise SkipRecord("Skipping county %s" % record["county"])
     parsed = parse_date(record["approval_date"], self.date_format)
     record["approval_date"] = parsed
     if parsed is None:
         raise SkipRecord("Record has no date.")
     return record
示例#30
0
 def clean_list_record(self, record):
     """
     Parse ``inspection_date`` (%m/%d/%Y) and apply ``smart_title`` to the
     address, restaurant name and result.  Returns the same ``record``.
     """
     inspected = parse_date(record['inspection_date'], '%m/%d/%Y')
     record['inspection_date'] = inspected
     for text_key in ('address', 'restaurant_name', 'result'):
         record[text_key] = smart_title(record[text_key])
     return record
示例#31
0
 def clean_list_record(self, record):
     """
     Parse ``approval_date`` for records in one of ``self.county_names``.

     Raises SkipRecord for any other county and for records whose
     ``approval_date`` parses to None.  Returns the same ``record``.
     """
     county_name = record['county']
     if not county_name.upper() in self.county_names:
         raise SkipRecord('Skipping county %s' % county_name)
     record['approval_date'] = parse_date(record['approval_date'], self.date_format)
     has_date = record['approval_date'] is not None
     if not has_date:
         raise SkipRecord('Record has no date.')
     return record
示例#32
0
 def clean_list_record(self, record):
     """
     Skip records outside ``self.counties`` and clean the remaining fields.

     The county is upper-cased and stripped before the membership test.
     ``activity_date`` is parsed as %m/%d/%Y, ``dba`` goes through
     ``smart_title`` and ``address`` through ``clean_address``.  Returns the
     same ``record``.
     """
     county = record['county'].upper().strip()
     if county not in self.counties:
         raise SkipRecord('Record not in %s.' % self.counties)
     activity = parse_date(record['activity_date'], '%m/%d/%Y')
     record['activity_date'] = activity
     record['dba'] = smart_title(record['dba'])
     record['address'] = clean_address(record['address'])
     return record
示例#33
0
    def parse_list(self, page):
        """
        Yield one dict per report row found in ``page``.

        Each dict maps the cleaned column headers to the cell contents and
        also carries ``report_date``, taken from the "Report as of ..." text.
        Raises ScraperBroken when that text is missing.  Being a generator,
        nothing runs until iteration starts.
        """
        page = page.replace('&nbsp;', ' ')

        # First, get the report date by looking for "Report as of XXXX".
        m = re.search(r'(?i)report as of (\w+ \d\d?, \d\d\d\d)</U>', page)
        if not m:
            raise ScraperBroken('Could not find "Report as of" in page')
        report_date = parse_date(m.group(1), '%B %d, %Y')

        # Determine the headers by looking at the <th> tags, and clean them up
        # to match our style for keys in the list_record dictionary (lower
        # case, underscores instead of spaces).
        # Raw strings are used for every pattern: '\s' in a normal string
        # literal is an invalid escape sequence.
        headers = [h.lower() for h in re.findall(r'(?i)<th[^>]*>(?:<a[^>]+>)?\s*(.*?)\s*(?:</a>)?</th>', page)]
        headers = [h.replace('<br>', ' ') for h in headers]
        headers = [re.sub(r'[^a-z]', ' ', h) for h in headers]
        headers = [re.sub(r'\s+', '_', h.strip()) for h in headers]

        # Dynamically construct a regex based on the number of headers.
        # Note that this assumes that at most *one* of the headers has an
        # empty name; if more than one header has an empty name, this regex
        # will have multiple named groups with the same name, which will cause
        # an error.
        cell_pattern = r'\s*<td[^>]*>\s*(?:<center>)?\s*(?P<%s>.*?)\s*(?:</center>)?\s*</td[^>]*>\s*'
        cells = [cell_pattern % (h or 'number') for h in headers]
        pattern = r'(?si)<tr valign=top class=report_column>%s</tr>' % r'\s*'.join(cells)
        for record in re.finditer(pattern, page):
            yield dict(record.groupdict(), report_date=report_date)
示例#34
0
    def clean_detail_record(self, record):
        """
        Parse the inspection date and violations, and build the notes text.

        ``violations`` becomes a list of ``(name, corrected)`` tuples taken
        from ``detail_violations_re``; ``corrected`` is True when the second
        captured value is 'Yes'.  ``notes`` says which violations were
        corrected during the inspection and is '' when there were none.
        Raises SkipRecord when the record has no ``inspection_date`` key.
        Returns the same ``record``.
        """
        try:
            record['inspection_date'] = parse_date(record['inspection_date'], '%m/%d/%Y')
        except KeyError:
            # Sometimes the inspection_date is missing, for whatever reason.
            # Log it and skip the record in this case.
            self.logger.info('Record %r has no inspection_date. Skipping.', record)
            raise SkipRecord
        record['violations'] = [(i[0], i[1] == 'Yes') for i in detail_violations_re.findall(record['violations'])]

        # Determine the notes (a textual representation of which violations,
        # if any, were corrected during the inspection).
        corrected_violations = [v[0] for v in record['violations'] if v[1]]
        if record['violations']:
            if not corrected_violations:
                # Singular/plural depends on how many violations there were.
                # Testing len(corrected_violations) here would always be 0,
                # making the singular wording unreachable.
                if len(record['violations']) == 1:
                    note_bit = 'violation was not'
                else:
                    note_bit = 'violations were not'
            elif len(corrected_violations) == 1:
                if len(record['violations']) == 1:
                    note_bit = 'violation was'
                else:
                    note_bit = '%s violation was' % corrected_violations[0]
            else: # Multiple corrected violations.
                note_bit = '%s violations were' % get_text_list(corrected_violations, 'and')
            notes = 'The %s corrected during the inspection.' % note_bit
        else:
            # There's no need for notes if there were no violations.
            notes = ''
        record['notes'] = notes

        return record
示例#35
0
    def clean_list_record(self, record):
        """
        Parse the filing date, rename raw columns and derive the property type.

        Raises SkipRecord (after logging) when ``filing_dat`` cannot be
        parsed as a date.  ``year_of_mortgage`` is the ``.year`` of
        ``year_of_mo`` or 'Unknown' when that value has no such attribute.
        ``property_type`` is taken from the first non-zero of the ``sf``,
        ``smf`` and ``condo`` flags.  Returns the same ``record``.
        """
        strip_dict(record)
        try:
            # int() drops the leading zero of an MMDDYY value, so pad back to
            # six digits; otherwise a five-digit value such as 10508 can be
            # read by %m%d%y as month 10, day 5 instead of month 01, day 05.
            filing_text = '%06d' % int(record['filing_dat'])
            record['filing_date'] = parse_date(filing_text, '%m%d%y')
        except ValueError:
            record['filing_date'] = None
        if record['filing_date'] is None:
            self.logger.info('Skipping invalid filing date %r', record['filing_dat'])
            raise SkipRecord
        record['address'] = clean_address(record.pop('address'))
        record['case_number'] = record.pop('case_#')
        record['document_number'] = record.pop('document_#')
        # Stored back under the same key; raises KeyError if it is missing.
        record['pin_number'] = record.pop('pin_number')
        try:
            record['year_of_mortgage'] = str(record.pop('year_of_mo').year)
        except AttributeError:
            record['year_of_mortgage'] = 'Unknown'

        # Normalize inconsistent headers
        for old, new in (('SF', 'sf'), ('SMF', 'smf'), ('Condo', 'condo')):
            try:
                record[new] = record.pop(old)
            except KeyError:
                pass

        if int(record['sf']):
            record['property_type'] = 'Single family'
        elif int(record['smf']):
            record['property_type'] = 'Multi-unit'
        elif int(record['condo']):
            record['property_type'] = 'Condo'
        else:
            record['property_type'] = 'Unknown'

        return record
示例#36
0
 def clean_list_record(self, record):
     """
     Skip records for other geo codes, split ``type_dup`` and parse the
     expiration date.

     Raises SkipRecord when ``record['geo_code']`` differs from
     ``self.geo_code``.  Returns the record produced by
     ``LiquorLicenseScraper.clean_list_record`` with the extra fields set.
     """
     if record['geo_code'] != self.geo_code:
         raise SkipRecord
     record = LiquorLicenseScraper.clean_list_record(self, record)
     license_type, dup = record.pop('type_dup').split('/')
     record['type'], record['dup'] = license_type, dup
     expires = parse_date(record['expir_date'], '%m/%d/%Y')
     record['expir_date'] = expires
     return record
示例#37
0
    def clean_list_record(self, record):
        """
        Collapse the vio1..vio58 columns and convert the numeric fields.

        ``violations`` becomes a list of ``(number_as_str, count)`` tuples
        for the non-zero columns, which are removed from the record.  Raises
        SkipRecord when the city is not in ``self.city_names``.  Returns the
        same ``record``.
        """
        # Collapse the violations into a single value, rather than 58 values,
        # most of which are zero.
        counts = [(str(i), int(record.pop('vio%s' % i))) for i in range(1, 59)]
        record['violations'] = [(number, count) for number, count in counts if count]

        record['inspection_date'] = parse_date(record['inspection_date'],
                                               '%m/%d/%Y')
        record['address'] = clean_address(record['address'])

        city = record['city']
        if city not in self.city_names:
            raise SkipRecord('Skipping city %s' % city)
        record['city'] = city.title()

        for int_key in ('visit_number', 'critical_violations',
                        'noncritical_violations', 'total_violations',
                        'inspection_number'):
            record[int_key] = int(record[int_key])

        return record
示例#38
0
    def clean_list_record(self, record):
        """
        Normalizes one record in place and returns it.

        Adds 'permit_types' (each comma-separated PERMITAPPROVALS entry run
        through self.clean_permit_type, or an empty list when the value is
        None), 'location' (the part of JOBLOCATION before the first double
        space) and 'issue_date' (ISSUEDATE parsed as DD-Mon-YY or
        MM/DD/YYYY), and strips whitespace from WORKDESC and SUBDESC.

        Raises SkipRecord when WORKDESC is '4635' or JOBLOCATION is None.
        Raises ValueError when ISSUEDATE matches neither date format.
        """
        if record['WORKDESC'] == '4635':
            raise SkipRecord('WORKDESC is 4635')
        if record['JOBLOCATION'] is None:
            raise SkipRecord('Record has no location')
        if record['PERMITAPPROVALS'] is None:
            record['permit_types'] = []
        else:
            record['permit_types'] = [
                self.clean_permit_type(pa)
                for pa in record['PERMITAPPROVALS'].split(',')
            ]

        # Addresses and extra data generally seem to be separated by 2 or more spaces
        record['location'] = record['JOBLOCATION'].split('  ')[0]

        # Try each known date layout in turn; the first that parses wins.
        for date_format in ('%d-%b-%y', '%m/%d/%Y'):
            try:
                issue_date = parse_date(record['ISSUEDATE'], date_format)
            except ValueError:
                continue
            break
        else:
            # No layout matched. Fail with a clear message instead of
            # falling through to an unbound local variable below.
            raise ValueError('Unrecognized ISSUEDATE %r'
                             % (record['ISSUEDATE'],))
        record['issue_date'] = issue_date

        record['WORKDESC'] = record['WORKDESC'].strip()
        record['SUBDESC'] = record['SUBDESC'].strip()
        return record
示例#39
0
    def clean_list_record(self, record):
        """
        Normalizes one restaurant inspection record in place and returns it.

        Parses 'inspection_date' as MM/DD/YY, adds 'restaurant_hash',
        'raw_address' and 'raw_city', and replaces 'violation_list' with a
        list of (code, comment) tuples. Lines of the original list that do
        not look like a violation are joined with spaces into 'notes'.
        """
        record['inspection_date'] = parse_date(record['inspection_date'], '%m/%d/%y')

        # The PDFs carry no unique ID for a restaurant, so one is derived by
        # hashing the upper-cased restaurant name and address.
        name_key = record['restaurant_name'].upper().encode('utf8')
        address_key = record['address'].upper().encode('utf8')
        record['restaurant_hash'] = md5.new('%s.%s' % (name_key, address_key)).hexdigest()
        record['raw_address'] = record['address'].upper()
        record['raw_city'] = record['city'].upper()

        violations = []
        leftovers = []
        for line in record['violation_list']:
            # Split the line into code and comment. The text may be spread
            # over several lines, and the start of a line can *look* like a
            # code without being one. The prime example is "215.685.7498",
            # the health department's phone number, which the negative
            # lookahead in this regex special-cases.
            found = re.search(r'^(?!215[-\.]685[-\.]74|856[-\.]912[-\.]4193)(\d[-\d\.]+(?::?\s*?\([A-Z]\))?)\s+(.*)$', line)
            if not found:
                leftovers.append(line)
                continue

            code, comment = found.groups()
            # Collapse multiple spaces into one.
            code = re.sub(r'\s\s+', ' ', code)
            # Comments contain runs like 'foo-------bar'; tidy those up.
            comment = re.sub(r'\s*--+\s*', ' -- ', comment)
            violations.append((code, comment))

        record['violation_list'] = violations
        record['notes'] = ' '.join(leftovers)
        return record
示例#40
0
    def clean_list_record(self, record):
        """
        Normalizes one record in place and returns it.

        Trailing annotations matched by the note patterns below are cut off
        the 'name', 'business_type' and 'location' fields and collected, in
        order, in record['notes']. The location is then title-cased and the
        date parsed as YYYY-MM-DD.

        Raises SkipRecord when the (name, location) pair is listed in
        BUSINESS_NAMES_TO_IGNORE or when the location is empty.
        """
        note_patterns = (
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*w\/d.*)',
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*withd.*)',
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*ch\s+227\s+sec\s*5A.*)',
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*ch\s+bus\s+.*)',
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*c\/l.*)',
        )
        flags = re.I | re.M

        # Strip notes off of several cruft-prone fields. Each pattern is
        # applied to whatever the previous pattern left behind.
        collected = []
        for field in ('name', 'business_type', 'location'):
            text = record.get(field, '').strip()
            for pattern in note_patterns:
                found = re.match(pattern, text, flags)
                if found is None:
                    continue
                text = found.group('value')
                collected.append(found.group('notes'))
            record[field] = text.strip()

        record['notes'] = collected
        record['location'] = smart_title(record['location'].strip())
        record['date'] = parse_date(record['date'].strip(), '%Y-%m-%d')

        name = record['name']
        if (name.upper(), record['location'].upper()) in BUSINESS_NAMES_TO_IGNORE:
            raise SkipRecord('Skipping %s (explicitly ignored)' % name)
        if record['location'] == '':
            raise SkipRecord('Skipping %s (no location)' % name)
        return record
示例#41
0
    def clean_list_record(self, record):
        """
        Normalizes one record in place and returns it.

        Parses the three date fields as YYYY-MM-DD, strips the address
        fields, replaces NARRATIVE with the TIME and DISPOSITION values found
        in it, and adds 'location_name', 'broad_category' and
        'detail_category'.

        NOTE(review): when NARRATIVE does not match the expected
        'Time: ...<br>...<br>Disposition: ...' layout the regex match is
        None, so the groupdict() call raises AttributeError.
        """
        for date_field in ('CALL_DATE', 'OFFENSE_DATE', 'REPORT_DATE'):
            record[date_field] = parse_date(record[date_field], '%Y-%m-%d')
        for text_field in ('COMMON_LOCATION', 'ADDRESS_NBR', 'ADDRESS'):
            record[text_field] = record[text_field].strip()

        # The 'NARRATIVE' field includes time and disposition data. Parse that out.
        narrative = record.pop('NARRATIVE')
        m = re.search(r'^Time: (?P<TIME>\d\d?:\d\d)<br>.*?<br>Disposition: (?P<DISPOSITION>.*)$', narrative)
        record.update(m.groupdict())

        record['TIME'] = parse_time(record['TIME'], '%H:%M')

        # Work out location_name. The rule depends on the ADDRESS_TYPE.
        kind = record['ADDRESS_TYPE']
        number = record['ADDRESS_NBR']
        street = record['ADDRESS']
        landmark = record['COMMON_LOCATION']
        if kind == 'PREMISE ADDRESS':
            location_name = '%s block of %s' % (number, clean_address(street))
        elif kind == 'INTERSECTION':
            if '/' in street:
                # Only the first two streets are used.
                crossing = street.split('/')
                location_name = '%s and %s' % (clean_address(crossing[0]), clean_address(crossing[1]))
            else:
                location_name = clean_address(street)
        elif kind == 'GEO-OVERRIDE':
            location_name = clean_address(street)
        elif kind == 'COMMON LOCATION':
            # Use the most specific combination of fields that is non-empty.
            if number and street:
                location_name = '%s %s' % (number, clean_address(street))
            elif street and landmark:
                location_name = '%s (%s)' % (clean_address(street), clean_address(landmark))
            elif landmark:
                location_name = clean_address(landmark)
            elif street:
                location_name = clean_address(street)
            else:
                location_name = 'Unknown'
        else:
            location_name = 'Unknown'
        record['location_name'] = location_name

        # Crime types without a CUSTOM_CATEGORIES entry are 'Unknown'.
        try:
            categories = CUSTOM_CATEGORIES[record['ORIG_CRIMETYPE_NAME']]
        except KeyError:
            categories = ('Unknown', 'Unknown')
        broad, detail = categories
        record['broad_category'] = broad
        record['detail_category'] = detail

        return record
示例#42
0
 def clean_list_record(self, record):
     """
     Normalizes one permit record in place and returns it.

     Parses 'permit_date' as MM/DD/YYYY, replaces each run of line breaks
     in 'description' with a single space, trims it and decodes it from
     ISO-8859-1, and adds 'clean_address' as the result of smart_title()
     on 'address'.
     """
     record['permit_date'] = parse_date(record['permit_date'], '%m/%d/%Y')
     single_line = re.sub(r'[\r\n]+', ' ', record['description'])
     record['description'] = single_line.strip()
     # Decoding here avoids database-level encoding errors.
     record['description'] = record['description'].decode('iso-8859-1')
     record['clean_address'] = smart_title(record['address'])
     return record
示例#43
0
 def clean_list_record(self, record):
     """
     Adds the location for the record's permit and parses its date.

     record['location'] is set to self.get_location(PERMIT_NUMBER), and
     'InspectionDate' is parsed using the format '%Y-%m-%dT00:00:00'.

     Raises SkipRecord when get_location() returns None.
     """
     permit_number = record['PERMIT_NUMBER']
     location = self.get_location(permit_number)
     record['location'] = location
     if location is None:
         raise SkipRecord("No permit found for '%s'" % permit_number)
     record['InspectionDate'] = parse_date(record['InspectionDate'],
                                           '%Y-%m-%dT00:00:00')
     return record
示例#44
0
 def clean_list_record(self, record):
     """
     Normalizes one permit record in place and returns it.

     Adds 'clean_street_name', 'clean_cross_1' and 'clean_cross_2' (the
     street fields with leading zeros removed and title-cased), rewrites
     'Permit Reason' with a leading capital, and parses 'Effective Date'
     and 'Expiration Date' as 'YYYY-MM-DD HH:MM:SS'.
     """
     record['clean_street_name'] = smart_title(
         remove_leading_zero(record['streetname']))

     # Cross streets separate names with ' \ ' (space, backslash, space);
     # turn that into ' / '. The backslash is written as '\\' because a
     # bare '\ ' is an invalid escape sequence in a non-raw string literal.
     for clean_key, raw_key in (('clean_cross_1', 'Cross Street 1'),
                                ('clean_cross_2', 'Cross Street 2')):
         cross_street = record[raw_key].replace(' \\ ', ' / ')
         record[clean_key] = smart_title(remove_leading_zero(cross_street))

     record['Permit Reason'] = capfirst(
         record['Permit Reason'].lower()).replace('Cut off service',
                                                  'Cut-off service')

     for date_key in ('Effective Date', 'Expiration Date'):
         record[date_key] = parse_date(record[date_key], '%Y-%m-%d %H:%M:%S')
     return record
示例#45
0
 def clean_list_record(self, record):
     """
     Normalizes one record in place and returns it.

     Trims 'name' and 'business_type', trims and title-cases 'location',
     and parses 'date' as YYYY-MM-DD.

     Raises SkipRecord when the upper-cased (name, location) pair is
     listed in BUSINESS_NAMES_TO_IGNORE.
     """
     for field in ('name', 'business_type'):
         record[field] = record[field].strip()
     record['location'] = smart_title(record['location'].strip())
     record['date'] = parse_date(record['date'].strip(), '%Y-%m-%d')

     ignore_key = (record['name'].upper(), record['location'].upper())
     if ignore_key in BUSINESS_NAMES_TO_IGNORE:
         raise SkipRecord('Skipping %s' % record['name'])
     return record
示例#46
0
 def clean_list_record(self, record):
     """
     Parses 'CO_Date' as MM/DD/YYYY and adds 'project_type'.

     'project_type' is the record's USDC_Code, except that the two codes
     listed as keys below are replaced by the label they map to.
     """
     replacements = {
         '328 - Other Not-CO\'able Non-Residential Buildings(well h': '328 - Other CO\'able Non-Res Bldgs(jails,post office)',
         '437 - All Other CO\'able Buildings / Structures': '437 - All Other Buildings / Structures(additions, remode',
     }
     record['CO_Date'] = parse_date(record['CO_Date'], '%m/%d/%Y')
     usdc_code = record['USDC_Code']
     record['project_type'] = replacements.get(usdc_code, usdc_code)
     return record
示例#47
0
 def clean_list_record(self, record):
     """
     Normalizes one record in place and returns it.

     'name' and 'business_type' are trimmed, 'location' is trimmed and
     title-cased, and 'date' is parsed as YYYY-MM-DD.

     Raises SkipRecord when the upper-cased (name, location) pair appears
     in BUSINESS_NAMES_TO_IGNORE.
     """
     name = record["name"].strip()
     record["name"] = name
     record["business_type"] = record["business_type"].strip()
     location = smart_title(record["location"].strip())
     record["location"] = location
     record["date"] = parse_date(record["date"].strip(), "%Y-%m-%d")
     if (name.upper(), location.upper()) in BUSINESS_NAMES_TO_IGNORE:
         raise SkipRecord("Skipping %s" % name)
     return record
示例#48
0
 def clean_detail_record(self, record):
     """
     Normalizes one detail record in place and returns it.

     Each run of whitespace in 'location' becomes a single space. The
     'entry_date' value, in 'YYYY-MM-DD HH:MM:SS' form, is split into a
     time stored under 'entry_time' and a date stored back under
     'entry_date'.
     """
     record['location'] = re.sub(r'\s+', ' ', record['location'])
     entered_at = parse_date(
         record['entry_date'], '%Y-%m-%d %H:%M:%S', return_datetime=True)
     record['entry_time'] = entered_at.time()
     record['entry_date'] = entered_at.date()
     return record
示例#49
0
 def clean_list_record(self, record):
     """
     Normalizes one record in place and returns it.

     Parses 'activity_date' as MM/DD/YYYY, applies smart_title() to 'dba'
     and clean_address() to 'address'.

     Raises SkipRecord when the upper-cased, trimmed county is not in
     self.counties.
     """
     county = record['county'].upper().strip()
     if county not in self.counties:
         # NOTE(review): if self.counties is a tuple, '%' unpacks it as the
         # format arguments instead of printing it -- confirm its type.
         raise SkipRecord('Record not in %s.' % self.counties)
     record['activity_date'] = parse_date(record['activity_date'],
                                          '%m/%d/%Y')
     record['dba'] = smart_title(record['dba'])
     record['address'] = clean_address(record['address'])
     return record
示例#50
0
    def clean_detail_record(self, record):
        """
        Normalizes one detail record in place and returns it.

        Parses the expiration and issue dates as MM/DD/YYYY, cleans up the
        permit text, and fills in 'landmark' / 'historic_district' depending
        on 'district_or_landmark'.

        Raises StopScraping when a NewsItem of this schema already exists
        with the same 'docket' and 'cofa' attributes.
        """
        for date_field in ('expiration_date', 'issue_date'):
            record[date_field] = parse_date(record[date_field], '%m/%d/%Y')

        # The PDF text is in the ISO-8859-1 encoding. Convert it here so that
        # saving it to the database doesn't fail with an encoding error.
        text = record['text'].decode('iso-8859-1')
        text = text.replace('Display This Permit While Work Is In Progress', '')
        record['text'] = text.strip()

        # Drop the "ISSUED TO" section: it names the person who owns the
        # property, and the policy is not to display names. As a sanity
        # check the section is only removed when it contains at most 9
        # newlines; more than that would signify a broken regular expression.
        issued_to = issued_re.search(record['text'])
        if issued_to and issued_to.group(0).count('\n') < 10:
            record['text'] = issued_re.sub('', record['text'])

        second_line = record['second_line_of_district']
        if second_line.strip():
            record['historic_district'] += second_line

        if record['district_or_landmark'] == 'HISTORIC DISTRICT':
            record['landmark'] = 'N/A'
        else:
            # The value belongs to a landmark, so move it over.
            record['landmark'] = record['historic_district']
            record['historic_district'] = 'N/A'

        # Check for a duplicate record. Because the scraper works in
        # reverse-chronological order, it is safe to raise StopScraping as
        # soon as a duplicate is reached.
        try:
            matches = NewsItem.objects.filter(schema__id=self.schema.id)
            matches = matches.by_attribute(self.schema_fields['docket'],
                                           record['docket'])
            matches = matches.by_attribute(self.schema_fields['cofa'],
                                           record['cofa'])
            old_record = matches[0]
        except IndexError:
            # Nothing stored yet for this docket/cofa pair.
            return record

        raise StopScraping('Found a duplicate record %s' % old_record.id)
示例#51
0
    def clean_list_record(self, record):
        """
        Normalizes one record in place and returns it.

        The 'created_time' value, in 'MM/DD/YYYY HH:MM' form, is split into
        'created_date' (a date) and 'created_time' (a time). 'x_coord' and
        'y_coord' are converted to float.
        """
        created = parse_date(
            record['created_time'], '%m/%d/%Y %H:%M', return_datetime=True)
        record['created_date'] = created.date()
        record['created_time'] = created.time()

        for axis in ('x_coord', 'y_coord'):
            record[axis] = float(record[axis])

        return record
示例#52
0
 def clean_detail_record(self, record):
     """
     Normalizes one detail record in place and returns it.

     'zone' is split on commas and each entry other than '&nbsp;' is run
     through self.clean_zone; an empty or None 'zone' becomes an empty
     list. 'application_date' and 'complete_date' are parsed as
     MM/DD/YYYY when present and set to None when the key is missing.
     """
     raw_zone = record['zone']
     zones = raw_zone.split(',') if raw_zone else []
     record['zone'] = [self.clean_zone(z) for z in zones if z != '&nbsp;']
     for key in ('application_date', 'complete_date'):
         # 'in' replaces dict.has_key(), which no longer exists in Python 3.
         if key in record:
             record[key] = parse_date(record[key], '%m/%d/%Y')
         else:
             record[key] = None
     return record
示例#53
0
 def clean_detail_record(self, record):
     """
     Normalizes one detail record in place and returns it.

     The comma-separated 'zone' value is turned into a list of
     self.clean_zone() results, skipping '&nbsp;' entries; an empty or
     None value gives an empty list. The 'application_date' and
     'complete_date' keys are parsed as MM/DD/YYYY when present and set
     to None otherwise.
     """
     zone_text = record['zone']
     zones = zone_text.split(',') if zone_text else []
     record['zone'] = [self.clean_zone(z) for z in zones if z != '&nbsp;']
     for key in ('application_date', 'complete_date'):
         # Membership test instead of dict.has_key(), which Python 3 removed.
         if key in record:
             record[key] = parse_date(record[key], '%m/%d/%Y')
         else:
             record[key] = None
     return record
示例#54
0
 def clean_list_record(self, record):
     """
     Normalizes one record in place and returns it.

     Every value is passed through strip_unneeded_tags(). 'start_date' and
     'end_date' are parsed as MM/DD/YY; 'update_date' and 'update_time'
     are removed and combined into the 'date_posted' datetime. 'address'
     is assembled from the block range and street fields, and in
     'details' each <br> becomes a newline and each '&nbsp;' a space.

     Raises SkipRecord when 'street_name' is empty after tag stripping.
     """
     for key in list(record):
         record[key] = strip_unneeded_tags(record[key])
     if not record['street_name']:
         raise SkipRecord('Street name not found')

     for date_field in ('start_date', 'end_date'):
         record[date_field] = parse_date(record[date_field], '%m/%d/%y')

     posted_day = record.pop('update_date')
     posted_clock = record.pop('update_time')
     record['date_posted'] = parse_date(
         '%s_%s' % (posted_day, posted_clock),
         '%m/%d/%Y_%I:%M %p',
         return_datetime=True)

     address_parts = (
         record['block_from'], record['block_to'], record['street_dir'],
         record['street_name'], record['street_suffix'])
     record['address'] = '%s-%s %s %s %s' % address_parts

     details = re.sub(r'(?i)<br>', '\n', record['details'])
     record['details'] = details.replace('&nbsp;', ' ').strip()
     return record