예제 #1
0
 def clean_list_record(self, record):
     """Validate the coordinates and parse the date fields of one record.

     The record dict is modified in place and returned.

     Raises SkipRecord when lat/long is the string '0' or None, when a
     coordinate is blank and no 'siteaddress' is available either, or when
     'servicetypecode' is 'METERS'.
     """
     lat, lng = record['lat'], record['long']
     if lat == '0' or lng == '0':
         raise SkipRecord('Got 0 for lat/long')
     if lat is None or lng is None:
         raise SkipRecord('Got no value for lat/long')

     # The blank check used to compare 'long' against the integer 0 only, so
     # a blank longitude string was never detected. Blank strings are now
     # checked for both coordinates; the integer 0 test is kept so existing
     # behaviour for that value is unchanged.
     coords_missing = lat == '' or lng == '' or lng == 0
     if coords_missing and record['siteaddress'] == '':
         raise SkipRecord('No value found for lat/long or address')

     # float('') raises ValueError, which previously crashed on records that
     # had an address but blank coordinates. Such records now carry None.
     # NOTE(review): confirm downstream code falls back to 'siteaddress'
     # when a coordinate is None.
     record['lat'] = float(lat) if lat != '' else None
     record['long'] = float(lng) if lng != '' else None

     if record['servicetypecode'] in ('METERS', ):
         raise SkipRecord('Skipping parking meter data')

     # These two timestamps are always parsed; only the part before 'T'
     # (the date) is used.
     record['order_date'] = parse_date(
         record['serviceorderdate'].split('T')[0], '%Y-%m-%d')
     record['add_date'] = parse_date(record['adddate'].split('T')[0],
                                     '%Y-%m-%d')

     # The remaining timestamps may be empty, in which case None is stored.
     optional_dates = (
         ('resolutiondate', 'resolution_date'),
         ('inspectiondate', 'inspection_date'),
         ('serviceduedate', 'service_due_date'),
     )
     for source_key, target_key in optional_dates:
         raw = record[source_key]
         if raw:
             record[target_key] = parse_date(raw.split('T')[0], '%Y-%m-%d')
         else:
             record[target_key] = None

     record['service_notes'] = record['servicenotes'] or ''
     record['inspection_complete'] = (record['inspectionflag'] == 'Y')
     return record
예제 #2
0
 def clean_list_record(self, record):
     """Keep only records from known counties that have an approval date.

     The 'approval_date' value is replaced by the result of parse_date
     using self.date_format.  Raises SkipRecord for a county that is not
     in self.county_names, or when parse_date returns None.
     """
     county = record['county']
     if county.upper() not in self.county_names:
         raise SkipRecord('Skipping county %s' % county)

     approval_date = parse_date(record['approval_date'], self.date_format)
     record['approval_date'] = approval_date
     if approval_date is None:
         raise SkipRecord('Record has no date.')
     return record
예제 #3
0
    def clean_list_record(self, record):
        """Split trailing notes off cruft-prone fields and normalise the record.

        For 'name', 'business_type' and 'location', every note pattern that
        matches moves its 'notes' group into record['notes'] and keeps the
        'value' group as the field text.  The location is then title-cased
        and the date parsed.  Raises SkipRecord for explicitly ignored
        (name, location) pairs and for records with an empty location.
        """
        # Each pattern captures the real value followed by an annotation
        # such as "w/d", "withd...", "ch 227 sec 5A", "ch bus ..." or "c/l".
        note_patterns = (
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*w\/d.*)',
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*withd.*)',
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*ch\s+227\s+sec\s*5A.*)',
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*ch\s+bus\s+.*)',
            r'(?P<value>.*?)\s*\-*\s*(?P<notes>\(?\s*c\/l.*)',
        )
        extracted_notes = []

        for field in ('name', 'business_type', 'location'):
            text = record.get(field, '').strip()
            for pattern in note_patterns:
                found = re.match(pattern, text, re.I | re.M)
                if found is None:
                    continue
                # Later patterns are applied to what is left of the value.
                text = found.group('value')
                extracted_notes.append(found.group('notes'))
            record[field] = text.strip()

        record['notes'] = extracted_notes
        record['location'] = smart_title(record['location'].strip())
        record['date'] = parse_date(record['date'].strip(), '%Y-%m-%d')

        lookup_key = (record['name'].upper(), record['location'].upper())
        if lookup_key in BUSINESS_NAMES_TO_IGNORE:
            raise SkipRecord('Skipping %s (explicitly ignored)' % record['name'])
        if record['location'] == '':
            raise SkipRecord('Skipping %s (no location)' % record['name'])
        return record
예제 #4
0
    def clean_list_record(self, record):
        """Categorise an offense record by its UCR codes and parse its dates.

        Raises SkipRecord when the two-character prefix of either UCR code
        is in SKIPPED_UCR_PREFIXES.  The record is modified in place and
        returned.
        """
        primary_ucr = record['offenseucr1']
        secondary_ucr = record['offenseucr2']
        # First two characters of each UCR code, or None when it is empty.
        ucr1_prefix = primary_ucr[:2] if primary_ucr else None
        ucr2_prefix = secondary_ucr[:2] if secondary_ucr else None

        for prefix, code in ((ucr1_prefix, primary_ucr),
                             (ucr2_prefix, secondary_ucr)):
            if prefix in SKIPPED_UCR_PREFIXES:
                raise SkipRecord('Skipping record with UCR %s' % code)

        record['address'] = self.normalizer.normalize_address(record)

        # Codes missing from the mappings are stored unchanged.
        record['category'] = CATEGORY_MAPPING.get(ucr1_prefix, ucr1_prefix)
        record['crime_type'] = UCR_MAPPING.get(primary_ucr, primary_ucr)
        record['secondary_category'] = CATEGORY_MAPPING.get(
            ucr2_prefix, ucr2_prefix)
        record['secondary_crime_type'] = UCR_MAPPING.get(
            secondary_ucr, secondary_ucr)

        record['offensedate'] = parse_date(record['offensedate'], '%m/%d/%Y')
        # Only the hour and minute of the start time are kept.
        started = time.strptime(record['offensestarttime'], '%H:%M:%S')
        record['offensestarttime'] = datetime.time(started[3], started[4])
        record['offensebeat'] = record['offensebeat'] or ''
        return record
예제 #5
0
    def clean_list_record(self, record):
        """Clean one permit record in place and return it.

        Raises SkipRecord when WORKDESC is '4635', when JOBLOCATION is
        None, or when ISSUEDATE matches none of the accepted date formats.
        """
        if record['WORKDESC'] == '4635':
            raise SkipRecord('WORKDESC is 4635')
        if record['JOBLOCATION'] is None:
            raise SkipRecord('Record has no location')

        approvals = record['PERMITAPPROVALS']
        if approvals is None:
            record['permit_types'] = []
        else:
            record['permit_types'] = [
                self.clean_permit_type(pa) for pa in approvals.split(',')
            ]

        # Addresses and extra data generally seem to be separated by 2 or more spaces
        record['location'] = record['JOBLOCATION'].split('  ')[0]

        # Try each known date layout in turn. Previously, when none of them
        # parsed, 'issue_date' was never bound and the assignment below
        # failed with an UnboundLocalError; such records are now skipped
        # with an explicit message.
        for date_format in ('%d-%b-%y', '%m/%d/%Y'):
            try:
                issue_date = parse_date(record['ISSUEDATE'], date_format)
            except ValueError:
                continue
            break
        else:
            raise SkipRecord('Could not parse issue date %r' %
                             (record['ISSUEDATE'],))
        record['issue_date'] = issue_date

        record['WORKDESC'] = record['WORKDESC'].strip()
        record['SUBDESC'] = record['SUBDESC'].strip()
        return record
예제 #6
0
    def clean_list_record(self, record):
        """Derive address, flag and sign fields from one spreadsheet row.

        Raises SkipRecord (after logging) when 'Latest Action Date' is not
        a datetime, when the sign text is longer than 255 characters, or
        when a non-string sign text cannot be converted through int().
        """
        record['address'] = '%s %s, %s' % (
            record['House #'],
            smart_title(record['Street Name']),
            smart_title(record['Borough']),
        )
        # 'Y' in these columns means True; anything else means False.
        yes_no_columns = (
            ('is_landmark', 'Landmark'),
            ('is_adult_establishment', 'Adult Estab'),
            ('is_city_owned', 'City Owned'),
        )
        for target, column in yes_no_columns:
            record[target] = record[column] == 'Y'

        # A missing column yields 'Not available'; a present but empty value
        # yields 'Not illuminated'.
        illumination = record.get('Sign Illumination Type', 'Not available')
        record['illumination_type'] = illumination or 'Not illuminated'

        latest_action = record['Latest Action Date']
        if not isinstance(latest_action, datetime.datetime):
            self.logger.info('Skipping job #%s, with latest action date %s',
                             record.get('Job #'), latest_action)
            raise SkipRecord()

        raw_text = record['Text on Sign']
        try:
            record['sign_text'] = raw_text.strip()
            if len(record['sign_text']) > 255:
                # Some records are malformed and carry a bogus, very long
                # sign text.
                self.logger.info('Skipping job #%s, with Text on Sign %s',
                                 record.get('Job #'), raw_text)
                raise SkipRecord()
        except AttributeError:
            # The value has no strip() method, so it is not a string; store
            # it as an integer string if int() accepts it.
            try:
                record['sign_text'] = str(int(raw_text))
            except TypeError:
                self.logger.info('Skipping job #%s, with Text on Sign %s',
                                 record.get('Job #'), raw_text)
                raise SkipRecord()

        try:
            sign_for = record['Sign Advertising']
        except KeyError:
            sign_for = record['Usage']
        record['sign_for'] = sign_for

        try:
            near_highway = record['Sign Near Highway']
        except KeyError:
            # Older spreadsheets lack a 'Sign Near Highway' column. Their
            # 'Adjacent to Arterial Highway' column is not necessarily the
            # same thing, so the value is left unknown.
            record['is_near_highway'] = None
        else:
            record['is_near_highway'] = near_highway == 'Y'

        try:
            changeable = record['Sign Changeable Copy']
        except KeyError:
            # Older spreadsheets lack a 'Sign Changeable Copy' column, but
            # the value can be deduced: a sign with text is not changeable;
            # otherwise it is unknown.
            record['is_changeable_copy'] = False if record['sign_text'] else None
        else:
            record['is_changeable_copy'] = changeable == 'Y'

        return record
예제 #7
0
    def clean_list_record(self, rss_record):
        """Parse one RSS entry into a flat record dict.

        The summary is matched against one of two layouts: an "*UPDATE:"
        message, or a regular incident alert.  The incident and message IDs
        are taken from the entry's link.

        Returns a new dict; 'updated_parsed' is popped from rss_record.

        Raises SkipRecord when the summary cannot be parsed or the message
        is the one known-bad entry, and ScraperBroken when the link does
        not have the expected shape.
        """
        record = {
            'pub_date': datetime.date(*rss_record.pop('updated_parsed')[:3]),
            'summary': rss_record['summary'].strip(),
        }
        # The inline (?i) flag must lead the pattern: a flag placed after
        # '^' is rejected by newer Python versions.
        if re.search(r'(?i)^\*UPDATE:', record['summary']):
            # Matched case-insensitively, like the detection above, so that
            # an update written in another case is not detected and then
            # thrown away as unparseable.
            m = re.search(
                r'(?i)^\*UPDATE:\s*(?P<location_name>[^\*]*)\*\s*(?P<description>.*)\s*-\s*(?P<reporter>.*?)\#\#\#$',
                record['summary'])
            if not m:
                self.logger.warning('Could not parse update %r' %
                                    record['summary'])
                raise SkipRecord('Could not parse update %r' %
                                 record['summary'])
            record.update(m.groupdict())
            # Updates do not carry the incident fields; store them empty.
            record.update({
                'is_update': True,
                'incident_type': '',
                'fire_station': '',
                'radio_channels': '',
                'incident_time': '',
            })
        else:  # Not an update
            m = re.search(
                r'^\*(?P<incident_type>[^\*]*)\*\s*(?P<location_name>[^;]*);\s*MAP (?:\d+[- ]\w\d)?;\s*FS (?P<fire_station>\d+); (?P<description>.*?); Ch:(?P<radio_channels>[\d, ]+)\s*@(?P<incident_time>\d\d?:\d\d [AP]M)?\s*-(?P<reporter>.*?)\#\#\#$',
                record['summary'])
            if not m:
                raise SkipRecord('Could not parse %r' % record['summary'])
            record.update(m.groupdict())
            record['incident_type'] = record['incident_type'].upper(
            )  # Normalize
            # NOTE(review): split/join on the same separator leaves the
            # string unchanged; possibly whitespace stripping was intended.
            record['radio_channels'] = ','.join(
                record['radio_channels'].split(','))
            record['is_update'] = False
        record['description'] = record['description'].replace(
            '&nbsp;', ' ').replace('&quot;', '"').replace('&amp;',
                                                          '&').strip()
        record['location_name'] = record['location_name'].strip()

        # Get the incident ID and message ID from the Google Groups URL.
        # We'll use these as unique identifiers.
        m = re.search(
            r'browse_thread/thread/(?P<incident_id>[^/]*)/(?P<message_id>[^\?]*)\?',
            rss_record['link'])
        if not m:
            # The URL used to be passed as a second exception argument and
            # was never interpolated into the message.
            raise ScraperBroken('Got weird URL: %r' % (rss_record['link'],))
        record.update(m.groupdict())
        record['link'] = rss_record['link']

        # I can't figure out why this record is causing errors, so for now
        # we'll just skip it.
        if record['message_id'] == '0faabeab3aad8492':
            raise SkipRecord()

        return record
예제 #8
0
 def clean_list_record(self, record):
     """Convert coordinates, build a location name and parse optional dates.

     Raises SkipRecord when lat or long is the string '0'.  The record is
     modified in place and returned.
     """
     if '0' in (record['lat'], record['long']):
         raise SkipRecord('Got 0 for lat/long')
     for axis in ('lat', 'long'):
         record[axis] = float(record[axis])

     segment = (record['street'], record['quadrant'],
                record['fromintersection'], record['tointersection'])
     record['location_name'] = '%s %s, from %s to %s' % segment

     # Each timestamp is optional: an empty value becomes None, otherwise
     # the part before 'T' is parsed as a date.
     date_fields = (
         ('estimatedcompletiondate', 'estimated_completion_date'),
         ('actualcompletiondate', 'actual_completion_date'),
         ('estimatedstartdate', 'estimated_start_date'),
     )
     for source_key, target_key in date_fields:
         stamp = record[source_key]
         record[target_key] = (parse_date(stamp.split('T')[0], '%Y-%m-%d')
                               if stamp else None)

     record['project_name'] = record['projectname'] or ''
     record['road_type'] = record['functionalclass'] or 'N/A'
     return record
예제 #9
0
    def clean_list_record(self, record):
        """Build a cleaned dict (with '_attributes') from one event record.

        Raises SkipRecord when the record has no venue, or an empty one.
        """
        venue = record.get('venue', {})
        if not venue:
            raise SkipRecord("No venue")

        # Join the non-empty address components into one display string.
        address_keys = ('address_1', 'address_2', 'city', 'state', 'zip')
        address_parts = [venue.get(key, '').strip() for key in address_keys]
        location_name = ', '.join(part for part in address_parts if part)

        # record['time'] is divided by 1000 before being used as a timestamp.
        starts_at = datetime.datetime.fromtimestamp(record['time'] / 1000.0,
                                                    local_tz)

        return {
            'title': text_from_html(record['name']),
            'description': text_from_html(record.get('description', '')),
            'location_name': location_name,
            'location': Point(venue['lon'], venue['lat']),
            'url': record['event_url'],
            'item_date': starts_at.date(),
            '_attributes': {
                'venue_phone': venue.get('phone', ''),
                'venue_name': text_from_html(venue.get('name', '')),
                'start_time': starts_at.time(),
                'group_name': record['group']['name'],
            },
        }
예제 #10
0
 def clean_list_record(self, record):
     """Drop records outside self.counties and tidy date, DBA and address.

     Raises SkipRecord when the upper-cased, stripped county is not in
     self.counties.
     """
     county = record['county'].upper().strip()
     if county not in self.counties:
         raise SkipRecord('Record not in %s.' % self.counties)

     activity_date = parse_date(record['activity_date'], '%m/%d/%Y')
     record['activity_date'] = activity_date
     record['dba'] = smart_title(record['dba'])
     record['address'] = clean_address(record['address'])
     return record
예제 #11
0
    def clean_list_record(self, record):
        """Run the parent cleaning, then reject one known-bad record.

        Raises SkipRecord for inspection visit 3145783 of license 2164597.
        """
        cleaned = super(Scraper, self).clean_list_record(record)

        # Special-case: a bad record identified on 2009-02-11.
        is_bad_visit = str(cleaned['inspection_visit_id']) == '3145783'
        if is_bad_visit and str(cleaned['license_id']) == '2164597':
            raise SkipRecord('Invalid record')

        return cleaned
예제 #12
0
 def clean_list_record(self, record):
     """Attach the permit's location and parse the inspection date.

     Raises SkipRecord when self.get_location returns None for the
     record's PERMIT_NUMBER.
     """
     permit_number = record['PERMIT_NUMBER']
     location = self.get_location(permit_number)
     record['location'] = location
     if location is None:
         raise SkipRecord("No permit found for '%s'" % permit_number)

     inspected = parse_date(record['InspectionDate'], '%Y-%m-%dT00:00:00')
     record['InspectionDate'] = inspected
     return record
예제 #13
0
 def clean_list_record(self, record):
     """Strip text fields, title-case the location and parse the date.

     Raises SkipRecord when the upper-cased (name, location) pair is in
     BUSINESS_NAMES_TO_IGNORE.
     """
     for field in ('name', 'business_type'):
         record[field] = record[field].strip()
     record['location'] = smart_title(record['location'].strip())
     record['date'] = parse_date(record['date'].strip(), '%Y-%m-%d')

     ignore_key = (record['name'].upper(), record['location'].upper())
     if ignore_key in BUSINESS_NAMES_TO_IGNORE:
         raise SkipRecord('Skipping %s' % record['name'])
     return record
예제 #14
0
 def clean_list_record(self, record):
     """Convert coordinates and tidy the date, address and narrative fields.

     Raises SkipRecord when lat or long is the string '0'.  The record is
     modified in place and returned.
     """
     if record['lat'] == '0' or record['long'] == '0':
         raise SkipRecord('Got 0 for lat/long')
     record['lat'] = float(record['lat'])
     record['long'] = float(record['long'])
     # Only the date part (before the 'T') of the report timestamp is used.
     record['report_date'] = parse_date(record['reportdatetime'].split('T')[0], '%Y-%m-%d')
     record['address'] = record['blocksiteaddress'].replace(' B/O ', ' block of ')
     if record['narrative'].upper() == 'NO NARRATIVE IS AVAILABLE.':
         # This used to be a comparison ('=='), which discarded its result
         # and left the placeholder narrative in place.
         record['narrative'] = 'Not available'
     return record
예제 #15
0
    def normalize_address(self, record):
        """
        Rebuild a block-level address from a record's collapsed street field.

        Streets are provided with no spaces, so look for a known suffix at
        the end of the street and compare the remaining part to the
        self.streets mapping.  Longer suffixes are tried before shorter
        ones.  Updates the self.matches_found and self.records_seen
        counters.  Raises SkipRecord when the record has no street.
        """
        raw_street = record['offensestreet']
        if raw_street is None:
            raise SkipRecord('Skipping record with no street')

        suffix_groups = [('EXPWY', 'PKWY', 'FRWY', 'BLVD'),
                         ('HWY', 'FRW', 'AVE', 'CIR', 'EXT', 'BLV', 'PKW',
                          'ROW', 'WAY', 'EXP'),
                         ('DR', 'ST', 'RD', 'LN', 'BL', 'TR', 'WY', 'CT', 'PL',
                          'AV', 'CI'), ('P', 'R', 'F', 'D', 'S', 'L')]

        # Defaults used when no suffix/street combination is recognised.
        street = raw_street
        matching_suffix = ''
        match_found = False

        for suffix in (s for group in suffix_groups for s in group):
            if not raw_street.endswith(suffix):
                continue
            base_name = raw_street[:-len(suffix)]
            # Look the collapsed name up in the mapping of collapsed street
            # names to names in the streets table.  SAINT is encoded as ST
            # in the data but as Saint in the streets table, so names
            # starting with ST get a second try.
            candidates = [base_name]
            if base_name.startswith('ST'):
                candidates.append('SAINT%s' % base_name[2:])
            for candidate in candidates:
                try:
                    street = self.streets[candidate]
                except KeyError:
                    continue
                matching_suffix = suffix
                match_found = True
                break
            if match_found:
                break

        if match_found:
            self.matches_found += 1
        self.records_seen += 1

        block_number = record['offenseblock'].lstrip('0')
        if block_number[-2:] == 'xx':
            block_number = block_number.replace('xx', '00')
        pieces = (block_number, record['offensedirection'] or '', street,
                  matching_suffix)
        address = re.sub(r'\s+', ' ', '%s %s %s %s' % pieces)
        return address_to_block(address)
예제 #16
0
 def clean_list_record(self, record):
     """Add an address and boolean flags; require a real latest action date.

     Raises SkipRecord when 'Latest Action Date' is neither a date nor a
     datetime.
     """
     street = smart_title(record['Street Name'])
     borough = smart_title(record['Borough'])
     record['address'] = '%s %s, %s' % (record['House #'], street, borough)

     # 'Y' in these columns means True; anything else means False.
     for target, column in (('is_landmark', 'Landmarked'),
                            ('is_adult_establishment', 'Adult Estab'),
                            ('is_city_owned', 'City Owned')):
         record[target] = record[column] == 'Y'

     latest_action = record['Latest Action Date']
     if isinstance(latest_action, (datetime.date, datetime.datetime)):
         return record
     raise SkipRecord('Got last action date %s' % latest_action)
예제 #17
0
 def clean_list_record(self, record):
     """Split the 'date' field into incident date/time and tokenise units.

     The date is parsed with a full date-and-time format first; if that
     fails, a date-only format is used and the time is stored as None.
     Raises SkipRecord when the type mentions 'mutual aid' (any case).
     """
     stamp = record['date'].strip()
     try:
         parts = strptime(stamp, '%m/%d/%Y %I:%M:%S %p')
         record['incident_date'] = datetime.date(*parts[:3])
         record['incident_time'] = datetime.time(*parts[3:6])
     except ValueError:
         # No time component: fall back to the date-only layout.
         parts = strptime(stamp, '%m/%d/%Y')
         record['incident_date'] = datetime.date(*parts[:3])
         record['incident_time'] = None

     record['units'] = record['units'].split()

     incident_type = record['type']
     if 'mutual aid' in incident_type.lower():
         raise SkipRecord('Skipping mutual aid: %r' % incident_type)
     return record
예제 #18
0
    def clean_list_record(self, record):
        """Strip markup and extra whitespace, then normalise the record.

        Raises SkipRecord when the DBA is in SKIPPED_DBAS or when the
        structure is 'INDIVIDUAL'.
        """
        # Remove tags and collapse runs of whitespace in every value.
        for key, value in record.items():
            plain = strip_tags(value)
            record[key] = re.sub(r'(?s)\s\s+', ' ', plain).strip()

        # Remove the "Suite/Apt" or "Floor" clause from the address, if it exists.
        address = record['address'].replace(' ,', ',')
        record['address'] = address
        unit_clause = re.search(r'^(.*?), (?:Suite/Apt|Floor):.*$', address)
        if unit_clause:
            record['address'] = unit_clause.group(1)
        record['address'] = clean_address(record['address'])

        dba = record['dba']
        if dba in SKIPPED_DBAS:
            raise SkipRecord('Skipping %r' % dba)

        # For privacy reasons, skip individuals.
        if record['structure'].upper().strip() == 'INDIVIDUAL':
            raise SkipRecord('Skipping structure=individual')

        for id_field in ('city_id', 'site_id'):
            record[id_field] = int(record[id_field])
        return record
예제 #19
0
    def clean_list_record(self, record):
        """Parse the dates, clean the address and filter by city.

        Raises SkipRecord when the city is not in self.city_names.
        """
        record['application_date'] = parse_date(record['application_date'], '%m/%d/%Y')

        # The review date is sometimes 'n/a'; store None in that case.
        try:
            review_date = parse_date(record['review_date'], '%m/%d/%Y')
        except ValueError:
            review_date = None
        record['review_date'] = review_date

        cleaned_address = clean_address(record['address'])
        record['address'] = strip_unit(cleaned_address)

        city = record['city']
        if city not in self.city_names:
            raise SkipRecord('Skipping city %s' % city)
        record['city'] = city.title()
        return record
예제 #20
0
 def parse_detail(self, page, list_record):
     """Return one dict per match of self.parse_detail_re in the page.

     The 'html' group of each match is replaced by a 'violations' list
     holding group 1 of every self.parse_violations_re match within it.
     Raises SkipRecord as soon as a match has an inspection type listed
     in INSPECTION_TYPES_TO_IGNORE.
     """
     inspections = []
     for match in self.parse_detail_re.finditer(page):
         inspection = match.groupdict()
         inspection_type = inspection['inspection_type']
         if inspection_type in INSPECTION_TYPES_TO_IGNORE:
             raise SkipRecord('Got ignorable inspection type %r' %
                              inspection_type)
         violations_html = inspection.pop('html')
         inspection['violations'] = [
             found.group(1)
             for found in self.parse_violations_re.finditer(violations_html)
         ]
         inspections.append(inspection)
     return inspections
예제 #21
0
 def clean_list_record(self, record):
     """Filter by facility type and compute the derived inspection fields.

     Raises SkipRecord when the facility type is in
     EXCLUDED_FACILITY_TYPES.
     """
     # The facility type is determined by the 6th and 7th characters of
     # the facility ID.
     facility_type = record['FAC_ID'][5:7]
     if facility_type in EXCLUDED_FACILITY_TYPES:
         raise SkipRecord('Excluding record from facility type %s' %
                          facility_type)

     record['DATE'] = parse_date(record['DATE']).date()
     record['FAC_NAME'] = record['FAC_NAME'].decode('Latin-1')

     # The stored final score is the sum of the final and raw scores.
     final_score = float(record['FIN_SCORE'])
     raw_score = float(record['RAW_SCORE'])
     record['FIN_SCORE'] = final_score + raw_score

     record['schema_slug'] = self.get_schema_slug(record)
     record['facility_type'] = self.clean_facility_type(facility_type)
     record['result'] = self.get_result(record)
     return record
예제 #22
0
    def clean_list_record(self, record):
        """Attach a point and location name to a feed item, or skip it.

        Raises SkipRecord when no point is found, when there is no
        location name, or when the point lies outside the metro bounding
        box even after swapping its coordinates.
        """
        # NOTE(review): the converted values are stored as attributes, while
        # the rest of this method reads record['title']; confirm whether
        # item assignment was intended.
        record.title = convert_entities(record['title'])
        record.description = convert_entities(record['description'])

        # Don't know why, but some feeds have 'id' *instead* of 'link'.
        item_id = record.get('id', '')
        if item_id.startswith('http'):
            record['link'] = record['id']

        # This tries GeoRSS, RDF Geo, xCal, ...
        point, location_name = self.get_point_and_location_name(record)

        _short_title = '%s...' % record['title'][:30]

        if not point:
            raise SkipRecord("couldn't geocode any addresses in item '%s...'" %
                             _short_title)

        if not location_name:
            raise SkipRecord(
                "Skip, no location name and failed to reverse geocode %s for %r"
                % (point.wkt, _short_title))

        if not intersects_metro_bbox(point):
            # Some bad feeds give latitude and longitude in reverse order,
            # so try the swapped point before giving up.
            flipped = Point(point.y, point.x)
            if not intersects_metro_bbox(flipped):
                raise SkipRecord("Skipping %r as %s,%s is out of bounds" %
                                 (_short_title, point.y, point.x))
            self.logger.info(
                "Got points in apparently reverse order, flipping them")
            point = flipped

        record['location_name'] = location_name
        record['location'] = point
        return record
예제 #23
0
 def save(self, old_record, list_record, detail_record):
     """Validate list_record with NewsItemForm and create or update an item.

     'attributes' is popped from list_record and passed on separately.
     For new items, item_date and pub_date get defaults if absent.
     Raises SkipRecord with the form errors when validation fails.
     """
     attributes = list_record.pop('attributes', {})
     list_record.setdefault('schema', self.schema.id)
     if not old_record:
         list_record.setdefault('item_date', today())
         list_record.setdefault('pub_date', now())

     from ebpub.db.forms import NewsItemForm
     form = NewsItemForm(list_record, instance=old_record)
     if not form.is_valid():
         self.logger.info("Skipping due to validation failures:")
         for field_name, field_errors in form.errors.items():
             self.logger.info("%s: %s" % (field_name, field_errors.as_text()))
         raise SkipRecord(form.errors)

     return self.create_or_update(old_record, attributes,
                                  **form.cleaned_data)
예제 #24
0
 def clean_list_record(self, record):
     """Parse the waiver date, assemble the address and expand the borough.

     The street_number, street_name and street_suffix keys are removed
     from the record.  Raises SkipRecord for an unknown borough code.
     """
     record['waiver_date'] = parse_date(record['waiver_date'], '%m/%d/%y')

     number = record.pop('street_number', '').strip()
     name = record.pop('street_name', '').strip()
     suffix = record.pop('street_suffix', '').strip()
     record['address'] = ' '.join((number, name, suffix)).strip()

     borough_names = {
         'BK': 'Brooklyn',
         'BX': 'The Bronx',
         'MN': 'Manhattan',
         'QS': 'Queens',
         'SI': 'Staten Island',
     }
     try:
         record['borough'] = borough_names[record['borough']]
     except KeyError:
         raise SkipRecord('Invalid borough')
     return record
예제 #25
0
 def clean_list_record(self, record):
     """Strip unneeded tags from every value and build the derived fields.

     The update_date and update_time keys are removed and combined into
     'date_posted'.  Raises SkipRecord when the street name is empty.
     """
     for key in list(record):
         record[key] = strip_unneeded_tags(record[key])
     if not record['street_name']:
         raise SkipRecord('Street name not found')

     for date_field in ('start_date', 'end_date'):
         record[date_field] = parse_date(record[date_field], '%m/%d/%y')

     # The posting date and time live in separate fields; join them with
     # an underscore so they can be parsed in one go.
     posted = '%s_%s' % (record.pop('update_date'), record.pop('update_time'))
     record['date_posted'] = parse_date(posted, '%m/%d/%Y_%I:%M %p',
                                        return_datetime=True)

     address_parts = (record['block_from'], record['block_to'],
                      record['street_dir'], record['street_name'],
                      record['street_suffix'])
     record['address'] = '%s-%s %s %s %s' % address_parts

     # Turn <br> tags into newlines and non-breaking spaces into spaces.
     details = re.sub(r'(?i)<br>', '\n', record['details'])
     record['details'] = details.replace('&nbsp;', ' ').strip()
     return record
예제 #26
0
    def clean_list_record(self, record):
        """Build a cleaned dict (with '_attributes') from one photo record.

        Raises SkipRecord when the photo's point cannot be reverse-geocoded.
        """
        # Item date, in timezone of the photo owner.
        # Not sure how to determine what that is, so we'll leave it.
        taken = datetime.datetime.strptime(record['datetaken'],
                                           '%Y-%m-%d %H:%M:%S')
        # Posted date is a UTC timestamp; convert it to local_tz.
        uploaded = datetime.datetime.fromtimestamp(
            float(record['dateupload']), utc)
        pub_date = uploaded.astimezone(local_tz)

        raw_description = record['description']['_content']
        description = convert_entities(raw_description.strip())
        title = convert_entities(record['title'])

        x, y = record['longitude'], record['latitude']
        location = Point((float(x), float(y)))

        # Possibly we could figure out flickr's geo API and resolve
        # the photo's place_id and/or woeid to the place name?  But
        # those are probably not specific enough; reverse-geocode
        # instead.
        try:
            block, distance = reverse_geocode(location)
            location_name = block.pretty_name
        except ReverseGeocodeError:
            raise SkipRecord("Could not geocode location %s, %s" % (x, y))

        # Don't think any of the urls returned by the API's "extras"
        # correspond to the page? not sure.
        url = 'http://www.flickr.com/photos/%(owner)s/%(id)s' % record

        attributes = {
            'sourcename': 'Flickr',
            'user_id': record['owner'],
            'username': record['ownername'],
            # Thumbnail. 'Small square' photos are 75x75.
            'photo_href': record['url_sq'],
        }
        return {
            'item_date': taken.date(),
            'pub_date': pub_date,
            'description': description,
            'title': title,
            'location': location,
            'location_name': location_name,
            'url': url,
            '_attributes': attributes,
        }
예제 #27
0
    def clean_list_record(self, record):
        """Parse the inspection date and tidy the name and address fields.

        Raises SkipRecord when the last inspection date is 'not available'
        (any case).
        """
        inspected = record['last_inspection_date']
        if inspected.lower() == 'not available':
            raise SkipRecord('No inspection available')
        record['last_inspection_date'] = parse_date(inspected, '%m/%d/%Y')

        # Keep the first list_aka_re match, or an empty string when the
        # field is empty.
        aka = record['aka']
        record['aka'] = list_aka_re.findall(aka)[0] if aka else ''

        norm_dict_space(record, 'name', 'dba', 'address')
        record['result'] = record['result'].replace('&nbsp;', '').strip()
        record['city_id'] = int(record['city_id'])

        # Remove the trailing ZIP code from the address, if it exists.
        zip_match = re.search(r'(.*?)\s+\d\d\d\d\d$', record['address'])
        if zip_match:
            record['address'] = zip_match.group(1)
        record['address'] = clean_address(record['address'])

        return record
예제 #28
0
    def clean_list_record(self, record):
        """Clean an inspection record and join in its lookup-table data.

        Raises SkipRecord when the inspection date is missing, the score
        is not an integer, or the location or inspection type cannot be
        found in their tables.
        """
        inspected_on = parse_date(record['DateOfInspection'], '%Y-%m-%d')
        record['DateOfInspection'] = inspected_on
        if not inspected_on:
            raise SkipRecord('Inspection date not given')

        # Blank strings become None.
        for field in ('Total Time', 'EmployeeIDofInspector', 'Score'):
            record[field] = record[field].strip() or None

        # Calculate the score range.
        if record['Score']:
            try:
                score = int(record['Score'])
            except ValueError:
                raise SkipRecord('Got sketchy non-integer score %r' % record['Score'])
            record['Score'] = score
            # Walk the bucket floors 91, 81, ..., 11 from the top down;
            # anything below 11 lands in the '0-10' bucket.
            for floor in range(91, 1, -10):
                if score >= floor:
                    record['score_range'] = '%d-%d' % (floor, floor + 9)
                    break
            else:
                record['score_range'] = '0-10'
        else:
            record['score_range'] = 'N/A'

        # Get the location data from the locations table.
        try:
            location = self.locations()[record['LocationIDInspected']]
        except KeyError:
            raise SkipRecord('Location not found')
        # Explicitly decode with the utf8 codec, because these values
        # might contain funky characters.
        record['restaurant_dba'] = location['DBA'].decode('utf8').strip()
        record['restaurant_type'] = location['Type Description'].strip()
        record['address'] = location['StreetAddress'].decode('utf8').strip()
        record['LocationIDInspected'] = int(record['LocationIDInspected'])

        # Get the inspection type data from the inspection types table.
        try:
            type_row = self.inspection_types()[record['InspectionTypeID']]
        except KeyError:
            raise SkipRecord('Inspection type not found')
        record['inspection_type'] = type_row['InspectionType']

        # Get the violation data from the violations table; an inspection
        # without an entry there simply has no violations.
        try:
            found_violations = self.violations()[record['InspectionID']]
        except KeyError:
            record['violations'] = []
        else:
            type_lookup = self.violation_types()
            record['violations'] = [
                dict(violation, type=type_lookup[violation['ViolationTypeID']])
                for violation in found_violations
            ]

        return record
예제 #29
0
    def clean_list_record(self, record):
        """Build a cleaned dict from one feed entry, or skip the entry.

        Raises SkipRecord for 'Boston 24' summary posts, for entries with
        neither a location nor a location name, and for entries whose tags
        contain no known precinct.
        """
        if record['title'].startswith(u'Boston 24'):
            # We don't include the summary posts.
            # TODO: the 'Boston 24' tag indicates posts with aggregate
            # daily stats.  Make a separate schema for the aggregates,
            # with attributes like those used in
            # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
            # Or maybe not: these are citywide, not by precinct.
            # So what would be the Location?  Whole city??
            self.logger.info("boston daily crime stats, we don't know how to "
                             "handle these yet")
            raise SkipRecord

        item_date = datetime.date(*record['updated_parsed'][:3])
        summary = record['summary']

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.
        body_text = text_from_html(record['content'][0]['value'])
        location, location_name = self.get_point_and_location_name(
            record, address_text=body_text)

        if not location and not location_name:
            raise SkipRecord("No location or location_name")

        # Get the precinct from the tags: the first tag that names a known
        # precinct wins.
        known_precincts = [
            'A1', 'A15', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4', 'E13',
            'E18', 'E5'
        ]
        tags = [tag['term'] for tag in record['tags']]
        # TODO: we need a LocationType for precincts, and shapes; and
        # then we could set newsitem.location_object to the Location
        # for this precinct. For now we just save it as an attribute.
        precinct_code = next(
            (term for term in tags if term in known_precincts), None)
        if not precinct_code:
            raise SkipRecord("no precinct found in tags %r" % tags)

        precinct = self.get_or_create_lookup('precinct', precinct_code,
                                             precinct_code)

        return {
            'item_date': item_date,
            'location': location,
            'location_name': location_name,
            'title': record['title'],
            'description': summary,
            'url': record['link'],
            'attributes': {'precinct': precinct.id},
        }
예제 #30
0
 def clean_list_record(self, record):
     """Skip 'Acknowledged' records; delegate everything else to the parent.

     Raises SkipRecord when the record's primary_status is 'Acknowledged'.
     """
     status = record['primary_status']
     if status == 'Acknowledged':
         raise SkipRecord('Status is "Acknowledged"')
     parent = super(ConvertedCondominiumsScraper, self)
     return parent.clean_list_record(record)