Example #1
    def get_access_db(self, file_date=None):
        """
        Downloads the requested Microsoft Access file, saves it to a temporary
        file and returns the local file name.

        If file_date is None, then this will download the latest Access file.
        Otherwise, it will download the file with the given date, raising
        ScraperBroken if a file isn't available for that date.
        """
        # First, log into the file manager and get the list of all available
        # Microsoft Access (MDB) files.
        params = {'user': USERNAME, 'password': PASSWORD, 'start-url': '/', 'switch': 'Log In'}
        html = self.get_html(FILE_MANAGER_URL, params)
        mdb_files = re.findall(r'PrintFileURL\("(.*?\.mdb)"', html)
        if not mdb_files:
            raise ScraperBroken('Found no MDB files')
        mdb_files.sort()

        if file_date:
            requested_file = 'SFFOOD%s.mdb' % file_date.strftime('%m%d%Y')
            if requested_file not in mdb_files:
                raise ScraperBroken('%r not found. Choices are: %r' % (requested_file, mdb_files))
        else:
            # Assume the last filename in alphabetical order is the latest one.
            requested_file = mdb_files[-1]

        # Finally, download the file and return the local filename.
        mdb_url = urlparse.urljoin(FILE_MANAGER_URL, requested_file)
        filename = self.retriever.get_to_file(mdb_url)
        self.logger.debug('%s saved to %s', mdb_url, filename)
        return filename
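
For illustration, here is how a file_date maps onto the filename the method looks for; the date below is made up.

    import datetime

    file_date = datetime.date(2008, 3, 5)  # an arbitrary example date
    requested_file = 'SFFOOD%s.mdb' % file_date.strftime('%m%d%Y')
    # requested_file is now 'SFFOOD03052008.mdb', which is checked against
    # the list of MDB filenames scraped from the file manager page.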
Example #2
    def parse_list(self, page):
        # First, get the date and time that the page was updated.
        update_re = re.compile(
            r'(?si)<td height="42" bgcolor="white"><b><font size="3" face="Arial">(?P<update_date>.*?)</font></b></td>\s*</tr>\s*</table>\s*</td>\s*<td width="73" rowspan="2" valign="top">\s*<table border="0" width="71">\s*<tr>\s*<td height="42" bgcolor="white"><b><font size="3" face="Arial">(?P<update_time>.*?)</font></b></td>'
        )
        m = update_re.search(page)
        if not m:
            raise ScraperBroken('Update date not found')
        updated = m.groupdict()

        # Next, get the table that contains the rows we want.
        m = re.search(r'(?si)<table [^>]* width="868">(.*?)</table>', page)
        if not m:
            raise ScraperBroken('Data table not found')
        table = m.group(1)

        # Return each data row in that table *after* the first row (the headers).
        parse_list_re = re.compile(
            r'(?si)<tr>\s*<td[^>]*>(?P<street_name>.*?)</td>\s*<td[^>]*>(?P<street_dir>.*?)</td>\s*<td[^>]*>(?P<block_from>.*?)</td>\s*<td[^>]*>(?P<block_to>.*?)</td>\s*<td[^>]*>(?P<street_suffix>.*?)</td>\s*<td[^>]*>(?P<start_date>.*?)</td>\s*<td[^>]*>(?P<end_date>.*?)</td>\s*<td[^>]*>(?P<closure_type>.*?)</td>\s*<td[^>]*>(?P<details>.*?)</td>\s*</tr>'
        )
        for match in parse_list_re.finditer(table):
            record = match.groupdict()
            if 'street name' in record['street_name'].lower():
                continue  # Skip the header row.
            yield dict(record, **updated)
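
A minimal, self-contained sketch of the same finditer/groupdict flow, using made-up markup that is far simpler than the real page; it shows how dict(record, **updated) layers the page-level update date onto every row.

    import re

    table = '<tr><td>MAIN</td><td>N</td></tr><tr><td>OAK</td><td>S</td></tr>'
    row_re = re.compile(r'(?si)<tr><td>(?P<street_name>.*?)</td><td>(?P<street_dir>.*?)</td></tr>')
    updated = {'update_date': '1/1/2008', 'update_time': '9:00 AM'}
    for m in row_re.finditer(table):
        record = m.groupdict()
        # Each yielded dict combines the row fields with the shared
        # update_date/update_time captured once per page.
        print(dict(record, **updated))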
Example #3
    def clean_detail_record(self, record):
        if record == {}:
            # Parsing the detail page failed.
            return None
        if record['violation_points'].startswith('Not Available'):
            record['violation_points'] = 'N/A'
            record['followup_inspection'] = False
        else:
            if not record['violation_points'].isdigit():
                raise ScraperBroken('Got odd violation points value %r' %
                                    record['violation_points'])
            record['followup_inspection'] = int(
                record['violation_points']) > 27

        # Parse the violations from the HTML chunk. When we're done,
        # record['violation_list'] will be a (possibly empty) list of strings.
        vio_chunk = record.pop('violations')
        if vio_chunk == '':
            record['violation_list'] = []
        else:
            vios = violation_list_re.findall(vio_chunk)
            if not vios:
                raise ScraperBroken(
                    "Violation data not found for restaurant %s" %
                    record['restaurant_name'])
            record['violation_list'] = [
                strip_tags(convert_entities(v.strip())) for v in vios
            ]

        # Remove the ZIP code from the address, as it complicates geocoding.
        record['address'] = re.sub(r'\s*\d{5}\s*$', '', record['address'])
        # Strip extra internal whitespace.
        record['address'] = re.sub(r'\s+', ' ', record['address'])

        return record
Example #4
    def parse_list(self, page):
        page = page.replace('&nbsp;', ' ')

        # First, get the report date by looking for "Report as of XXXX".
        m = re.search(r'(?i)report as of (\w+ \d\d?, \d\d\d\d)</U>', page)
        if not m:
            raise ScraperBroken('Could not find "Report as of" in page')
        report_date = parse_date(m.group(1), '%B %d, %Y')

        # Determine the headers by looking at the <th> tags, and clean them up
        # to match our style for keys in the list_record dictionary (lower
        # case, underscores instead of spaces).
        headers = [h.lower() for h in re.findall('(?i)<th[^>]*>(?:<a[^>]+>)?\s*(.*?)\s*(?:</a>)?</th>', page)]
        headers = [h.replace('<br>', ' ') for h in headers]
        headers = [re.sub(r'[^a-z]', ' ', h) for h in headers]
        headers = [re.sub(r'\s+', '_', h.strip()) for h in headers]

        # Dynamically construct a regex based on the number of headers.
        # Note that this assumes that at most *one* of the headers has an
        # empty name; if more than one header has an empty name, this regex
        # will have multiple named groups with the same name, which will cause
        # an error.
        pattern = '(?si)<tr valign=top class=report_column>%s</tr>' % r'\s*'.join(
            [r'\s*<td[^>]*>\s*(?:<center>)?\s*(?P<%s>.*?)\s*(?:</center>)?\s*</td[^>]*>\s*' % (h or 'number')
             for h in headers])
        for record in re.finditer(pattern, page):
            yield dict(record.groupdict(), report_date=report_date)
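
The header-cleaning and pattern-building steps are easier to follow on a toy input; the headers below are invented, and the single empty header falls back to the name 'number' exactly as in the code above.

    import re

    html = '<th>Case No.</th><th>Date<br>Filed</th><th></th>'
    headers = [h.lower() for h in re.findall(r'(?i)<th[^>]*>\s*(.*?)\s*</th>', html)]
    headers = [h.replace('<br>', ' ') for h in headers]
    headers = [re.sub(r'[^a-z]', ' ', h) for h in headers]
    headers = [re.sub(r'\s+', '_', h.strip()) for h in headers]
    # headers == ['case_no', 'date_filed', '']
    pattern = '(?si)<tr>%s</tr>' % r'\s*'.join(
        r'<td[^>]*>(?P<%s>.*?)</td>' % (h or 'number') for h in headers)
    # pattern now contains the named groups case_no, date_filed and number.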
Example #5
    def get_excel_url(self):
        """
        Returns the full URL for the Excel file for self.month.

        This value is cached the first time the function is called.
        """
        if self._excel_url_cache is None:
            # Download the index page and search all of the ".xls" links for
            # the given month/year.
            index_url = 'http://www.sfgov.org/site/dbi_page.asp?id=30608'
            html = self.get_html(index_url)
            excel_links = re.findall(r'<a href="(.*?\.xls)">(.*?)</a>', html)
            month_name, year = self.month.strftime('%B,%Y').split(',')
            this_month_links = [
                link[0] for link in excel_links
                if (month_name in link[0] or month_name in link[1]) and (
                    year in link[0] or year in link[1])
            ]
            if len(this_month_links) != 1:
                raise ScraperBroken(
                    'Found %s links for %s %s on %s' %
                    (len(this_month_links), month_name, year, index_url))
            self._excel_url_cache = urljoin('http://www.sfgov.org/',
                                            this_month_links[0])
        return self._excel_url_cache
Example #6
    def clean_detail_record(self, record):
        if 'No Active DBA found' in record['business_name']:
            record['business_name'] = ''
        else:
            m = re.search(r'(?si)<tr><td><b>Doing Business As: </b>(.*?)</td></tr>', record['business_name'])
            if not m:
                raise ScraperBroken('Got unknown business_name value %r' % record['business_name'])
            record['business_name'] = m.group(1)
        record['address'] = record['address'].strip()

        # There can be multiple license types, so this requires further parsing
        # to create a list.
        license_types = []
        for m in license_types_re.finditer(record['license_types']):
            d = m.groupdict()
            d['status_date'] = parse_date(d['status_date'], '%d-%b-%Y')
            if not d['status_date']:
                # Skip license types that don't have a status date, because
                # a NewsItem is required to have an item_date, and we don't
                # care about licenses that don't have a change date.
                continue
            d['original_issue_date'] = parse_date(d['original_issue_date'], '%d-%b-%Y')
            d['expiration_date'] = parse_date(d['expiration_date'], '%d-%b-%Y')
            d['term'] = d['term'].replace('</B>', '').strip()
            license_types.append(d)
        record['license_types'] = license_types

        return record
예제 #7
0
 def get_viewstate(self, uri=None):
     uri = uri or self.root_uri
     html = self.get_html(uri)
     m = re.search(r'<input type="hidden" name="__VIEWSTATE" value="([^"]*)"', html)
     if not m:
         raise ScraperBroken('VIEWSTATE not found')
     return m.group(1)
예제 #8
0
 def parse_list(self, page):
     records = list(NewsItemListDetailScraper.parse_list(self, page))
     self.logger.debug('Got %s records', len(records))
     if len(records) >= 99:
         raise ScraperBroken(
             'Got %s records. Consider changing date interval' %
             len(records))
     return records
예제 #9
0
 def parse_list(self, page):
     precinct, raw_pdf = page
     pdf_text = pdfstring_to_text(raw_pdf, keep_layout=False)
     m = pdf_re.search(pdf_text)
     if not m:
         raise ScraperBroken("Didn't find data in PDF for precinct %s" % precinct)
     else:
         yield dict(m.groupdict(), precinct=precinct)
예제 #10
0
    def call_clearpath(self, where, offset=0, limit=2000):
        """
        Makes a request to the CPD site with the given WHERE clause (a list)
        and offset/limit. Returns a DOM object of the parsed XML.

        Note that the maximum limit is 2000; a single request returns at most
        2000 records.
        """
        # Example valid WHERE clauses:
        #     GIS.clearMap_crime_90days.DATEOCC between {ts &apos;2007-09-01 00:00:00&apos;} AND {ts &apos;2007-09-01 23:59:59&apos;}
        #     GIS.clearMap_crime_90days.DATEOCC &gt;= {ts &apos;2007-09-01&apos;}
        # Good documentation is available here:
        #     http://edndoc.esri.com/arcims/9.2/elements/get_features.htm
        xml_request = """
            <?xml version="1.0" encoding="UTF-8" ?>
            <ARCXML VERSION="1.1">
            <REQUEST>
                <GET_FEATURES outputmode="xml" geometry="true" globalenvelope="false" envelope="false" compact="true" beginrecord="%(offset)s" featurelimit="%(limit)s">
                    <LAYER id="999" type="featureclass">
                        <DATASET name="GIS.clearMap_crime_90days" type="point" workspace="sde_ws-1"  />
                    </LAYER>
                    <SPATIALQUERY where="%(where)s" subfields="#ALL#"></SPATIALQUERY>
                </GET_FEATURES>
            </REQUEST>
            </ARCXML>""" % {
            'where': ' AND '.join(where),
            'offset': offset,
            'limit': limit
        }
        data = {
            'ArcXMLRequest': xml_request.strip(),
            'JavaScriptFunction': 'parent.MapFrame.processXML',
            'BgColor': '#000000',
            'FormCharset': 'ISO-8859-1',
            'RedirectURL': '',
            'HeaderFile': '',
            'FooterFile': '',
        }
        url = 'http://gis.chicagopolice.org/servlet/com.esri.esrimap.Esrimap?ServiceName=clearMap&CustomService=Query&ClientVersion=4.0&Form=True&Encode=False'
        html = self.get_html(url, data)

        # The resulting HTML has some XML embedded in it. Extract that.
        m = re.search(
            r"var XMLResponse='(.*?)';\s*parent\.MapFrame\.processXML", html)
        if not m:
            raise ScraperBroken('"var XMLResponse" XML not found')
        raw_xml = m.group(1)

        # Clean invalid XML --
        # Attributes that start with "#".
        raw_xml = raw_xml.replace('#', 'Z')
        # Unescaped ampersands.
        raw_xml = re.sub(r'&(?!amp;)', '&amp;', raw_xml)
        # Unescaped '<' signs (shows up in "<18" in attributes).
        raw_xml = raw_xml.replace(r'<18', '&lt;18')

        return minidom.parseString(raw_xml)
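
A standalone illustration of the three cleanup steps, run on a made-up fragment of the kind of XML the CPD site returns (the real response is much larger).

    import re
    from xml.dom import minidom

    raw_xml = '<FEATURE BEAT="0421" AGE="<18" DESC="ROBBERY & ASSAULT" />'
    raw_xml = raw_xml.replace('#', 'Z')               # attribute names starting with "#"
    raw_xml = re.sub(r'&(?!amp;)', '&amp;', raw_xml)  # unescaped ampersands
    raw_xml = raw_xml.replace('<18', '&lt;18')        # unescaped "<" in "<18"
    doc = minidom.parseString(raw_xml)
    # doc.documentElement.getAttribute('DESC') == 'ROBBERY & ASSAULT'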
예제 #11
0
 def list_pages(self):
     html = self.get_html(SOURCE_URL)
     
     m = re.search(r'<input type="hidden" name="__VIEWSTATE" value="([^"]*)"', html)
     if not m:
         raise ScraperBroken('VIEWSTATE not found on %s' % self.source_url)
     viewstate = m.group(1)
     
     yield self.get_html(SOURCE_URL, {'__VIEWSTATE': viewstate, 'ddlEvtHours': self.num_hours, 'btnRefresh': 'Refresh'})
예제 #12
0
 def clean_list_record(self, record):
     try:
         license_number = record.pop('license_num')
     except KeyError:
         license_number = record.pop('license_number')
     m = re.search(r'(?i)<a href=.*?LQSdata\.asp\?ID=(\d+)>\s*(\d+)\s*</a>', license_number)
     if not m:
         raise ScraperBroken('License number link not found in %r' % license_number)
     record['place_id'], record['license_number'] = m.groups()
     return record
예제 #13
0
 def locations(self):
     """
     Lazily loads *all* locations into memory and returns a dictionary
     keyed by location ID.
     """
     if self._locations_cache is None:
         self._locations_cache = dict([(row['LocationID'], row) for row in self.mdb_table('tblLocations')])
         if not self._locations_cache:
             raise ScraperBroken('tblLocations was either empty or nonexistent')
     return self._locations_cache
예제 #14
0
 def inspection_types(self):
     """
     Lazily loads *all* inspection types into memory and returns a dictionary
     keyed by inspection type ID.
     """
     if self._inspection_type_cache is None:
         self._inspection_type_cache = dict([(row['InspectionTypeID'], row) for row in self.mdb_table('tblInspectionTypes')])
         if not self._inspection_type_cache:
             raise ScraperBroken('tblInspectionTypes was either empty or nonexistent')
     return self._inspection_type_cache
예제 #15
0
    def clean_list_record(self, rss_record):
        record = {
            'pub_date': datetime.date(*rss_record.pop('updated_parsed')[:3]),
            'summary': rss_record['summary'].strip(),
        }
        if re.search(r'^(?i)\*UPDATE:', record['summary']):
            m = re.search(
                r'^\*UPDATE:\s*(?P<location_name>[^\*]*)\*\s*(?P<description>.*)\s*-\s*(?P<reporter>.*?)\#\#\#$',
                record['summary'])
            if not m:
                self.logger.warn('Could not parse update %r' %
                                 record['summary'])
                raise SkipRecord('Could not parse update %r' %
                                 record['summary'])
            record.update(m.groupdict())
            record.update({
                'is_update': True,
                'incident_type': '',
                'fire_station': '',
                'radio_channels': '',
                'incident_time': '',
            })
        else:  # Not an update
            m = re.search(
                r'^\*(?P<incident_type>[^\*]*)\*\s*(?P<location_name>[^;]*);\s*MAP (?:\d+[- ]\w\d)?;\s*FS (?P<fire_station>\d+); (?P<description>.*?); Ch:(?P<radio_channels>[\d, ]+)\s*@(?P<incident_time>\d\d?:\d\d [AP]M)?\s*-(?P<reporter>.*?)\#\#\#$',
                record['summary'])
            if not m:
                raise SkipRecord('Could not parse %r' % record['summary'])
            record.update(m.groupdict())
            record['incident_type'] = record['incident_type'].upper()  # Normalize.
            record['radio_channels'] = ','.join(record['radio_channels'].split(','))
            record['is_update'] = False
        record['description'] = (record['description']
                                 .replace('&nbsp;', ' ')
                                 .replace('&quot;', '"')
                                 .replace('&amp;', '&')
                                 .strip())
        record['location_name'] = record['location_name'].strip()

        # Get the incident ID and message ID from the Google Groups URL.
        # We'll use these as unique identifiers.
        m = re.search(
            r'browse_thread/thread/(?P<incident_id>[^/]*)/(?P<message_id>[^\?]*)\?',
            rss_record['link'])
        if not m:
            raise ScraperBroken('Got weird URL: %r' % rss_record['link'])
        record.update(m.groupdict())
        record['link'] = rss_record['link']

        # I can't figure out why this record is causing errors, so for now
        # we'll just skip it.
        if record['message_id'] == '0faabeab3aad8492':
            raise SkipRecord()

        return record
예제 #16
0
 def clean_detail_record(self, record):
     body = record.pop('body')
     violations = [m.groupdict() for m in detail_violations_re.finditer(body)]
      if not violations and 'There are no violations for this inspection' not in body:
         raise ScraperBroken('Could not find violations')
     for vio in violations:
         vio['severity'] = {1: 'Non critical', 2: 'Critical', 3: 'Critical foodborne illness'}[vio.pop('stars').count('*')]
         vio['comment'] = strip_tags(vio['comment']).strip()
         vio['location'] = strip_tags(vio['location']).strip()
     record['violation_list'] = violations
     return record
예제 #17
0
 def get_html(self, *args, **kwargs):
     MAX_TRIES = 4
     tries = 0
     while tries < MAX_TRIES:
         html = NewsItemListDetailScraper.get_html(self, *args, **kwargs)
         if 'Unable to connect to PostgreSQL server' in html:
             self.logger.debug('Got "Unable to connect to PostgreSQL" error')
             tries += 1
             time.sleep(3)
             continue
         return html
     raise ScraperBroken('Got PostgreSQL error %s times' % MAX_TRIES)
Example #18
class NewsItemListDetailScraper(BaseScraper):
    def get_page(self, *args, **kwargs):
        """
        Calls NewsItemScraper's get_html method and returns an unsaved ``Page``
        object wrapping the html.
        """
        schema = kwargs.get('schema', None)
        schema = schema or self.schema
        html = super(NewsItemListDetailScraper, self).get_html(*args, **kwargs)
        return ScrapedPage(url=args[0], when_crawled=datetime.datetime.now(), html=html, schema=schema)

    def update_from_string(self, list_page):
        """
        For scrapers with has_detail=False, runs the equivalent of update() on
        the given string.

        This is useful if you've got cached versions of HTML that you want to
        parse.

        Subclasses should not have to override this method.
        """
        # TODO: Setting the page type should probably happen somewhere else.
        list_page.page_type = LIST_PAGE
        self.num_skipped = 0
        for list_record in self.parse_list(list_page.html):
            try:
                list_record = self.clean_list_record(list_record)
            except SkipRecord, e:
                self.num_skipped += 1
                self.logger.debug("Skipping list record for %r: %s " % (list_record, e))
                continue
            except ScraperBroken, e:
                # Re-raise the ScraperBroken with some additional helpful information.
                raise ScraperBroken('%r -- %s' % (list_record, e))
            self.logger.debug("Clean list record: %r" % list_record)

            old_record = self.existing_record(list_record)
            self.logger.debug("Existing record: %r" % old_record)

            if self.has_detail and self.detail_required(list_record, old_record):
                self.logger.debug("Detail page is required")
                try:
                    detail_page = self.get_detail(list_record)
                    # TODO: Setting the page type should probably happen somewhere else.
                    detail_page.page_type = DETAIL_PAGE
                    detail_record = self.parse_detail(detail_page.html, list_record)
                    detail_record = self.clean_detail_record(detail_record)
                except SkipRecord, e:
                    self.num_skipped += 1
                    self.logger.debug("Skipping detail record for list %r: %s" % (list_record, e))
                    continue
                except ScraperBroken, e:
                    # Re-raise the ScraperBroken with some additional helpful information.
                    raise ScraperBroken('%r -- %s' % (list_record, e))
Example #19
 def violations(self):
     """
     Lazily loads *all* violations into memory and returns a dictionary
     keyed by inspection ID.
     """
     if self._violations_cache is None:
         vs = {}
         for row in self.mdb_table('tblViolations'):
             vs.setdefault(row['InspectionID'], []).append(row)
         self._violations_cache = vs
         if not self._violations_cache:
             raise ScraperBroken('tblViolations was either empty or nonexistent')
     return self._violations_cache
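
The lazy-load-and-cache idiom shared by locations(), inspection_types() and violations() boils down to the following sketch; the class and rows here are invented for illustration.

    class ViolationSource(object):
        def __init__(self, rows):
            self._rows = rows
            self._violations_cache = None

        def violations(self):
            if self._violations_cache is None:  # build the dictionary only once
                vs = {}
                for row in self._rows:
                    # setdefault() creates the list on the first row for an
                    # InspectionID and appends to it on every later row.
                    vs.setdefault(row['InspectionID'], []).append(row)
                self._violations_cache = vs
            return self._violations_cache

    source = ViolationSource([{'InspectionID': 1, 'Code': '14A'},
                              {'InspectionID': 1, 'Code': '22B'}])
    assert source.violations() is source.violations()  # cached after the first call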
Example #20
    def list_pages(self):
        html = self.get_html(self.source_url)

        m = re.search(
            r'<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="([^"]*)"',
            html)
        if not m:
            raise ScraperBroken('VIEWSTATE not found on %s' % self.source_url)
        viewstate = m.group(1)

        m = re.search(
            r'<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="([^"]*)"',
            html)
        if not m:
            raise ScraperBroken('EVENTVALIDATION not found on %s' %
                                self.source_url)
        eventvalidation = m.group(1)

        yield self.get_html(
            self.source_url, {
                '__VIEWSTATE': viewstate,
                '__EVENTVALIDATION': eventvalidation,
                'cmdFind': 'Find'
            })
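
Several of these scrapers repeat the same hidden-field extraction; a hypothetical helper along these lines captures the idea (the function name and sample markup are assumptions, and the real scrapers raise ScraperBroken rather than ValueError).

    import re

    def hidden_field(html, name):
        # Return the value of an ASP.NET hidden form field such as __VIEWSTATE.
        m = re.search(
            r'<input type="hidden" name="%s"(?: id="%s")? value="([^"]*)"' % (name, name),
            html)
        if not m:
            raise ValueError('%s not found' % name)
        return m.group(1)

    html = '<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="dDwtMTQ4" />'
    params = {'__VIEWSTATE': hidden_field(html, '__VIEWSTATE'), 'cmdFind': 'Find'}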
Example #21
 def login(self):
     html = self.get_html(LOGIN_URI)
     m = re.search(r'<input type="hidden" name="__VIEWSTATE" value="([^"]*)"', html)
     if not m:
         raise ScraperBroken('VIEWSTATE not found')
     viewstate = m.group(1)
     html = self.get_html(LOGIN_URI, {
         '__EVENTTARGET': '',
         '__EVENTARGUMENT': '',
         '__VIEWSTATE': viewstate,
         'btn_Submit': 'Login',
         'Remember_Password': '******',
         'User_Name': USERNAME,
         'Password': PASSWORD,
         '_ctl1:QUERY': '',
         'fmt': 'standard'
     }, follow_redirects=False)
Example #22
    def list_pages(self):
        # Get the HTML page, which includes links to Excel files (one for each
        # borough). We do this instead of hard-coding the Excel file names in
        # the scraper because the Excel file names tend to change each month.
        url = 'http://www.nyc.gov/html/dof/html/property/property_val_sales.shtml'
        html = self.get_html(url)
        excel_links = re.findall(r'href="([^"]+\.xls)"', html)
        if len(excel_links) != 12:
            raise ScraperBroken('Got a strange number of Excel links: %s' % len(excel_links))

        # The first five links are the "New York City Sales Data" links,
        # which is what we want.
        for excel_link in excel_links[:5]:
            excel_url = urlparse.urljoin(url, excel_link)
            workbook_path = self.retriever.get_to_file(excel_url)
            reader = ExcelDictReader(workbook_path, sheet_index=0, header_row_num=4, start_row_num=5)
            yield reader
            os.unlink(workbook_path) # Clean up the temporary file.
Example #23
 def parse_list(self, file_obj):
     # Unzip the file. Although it has an .exe extension, we can treat it
     # just like a ZIP file.
     zf = zipfile.ZipFile(file_obj)
     if self.florida_csv_filename is None:
         csv_names = [
             n for n in zf.namelist() if n.lower().endswith('.csv')
         ]
         if len(csv_names) != 1:
             raise ScraperBroken('Found %s CSV file(s) in the zip' %
                                 len(csv_names))
         csv_filename = csv_names[0]
     else:
         csv_filename = self.florida_csv_filename
     csv_text = zf.read(csv_filename)
     # The data is in iso-8859-1 encoding, so we use UnicodeDictReader so
     # that it gets converted properly to Unicode objects.
     reader = UnicodeDictReader(StringIO(csv_text),
                                self.florida_csv_fieldnames,
                                encoding='iso8859-1')
     for row in reader:
         yield row
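
The unzip-then-parse flow can be exercised without the custom UnicodeDictReader; this sketch uses only the standard library, with an in-memory zip standing in for the downloaded file and invented field names.

    import csv
    import zipfile
    from io import BytesIO

    buf = BytesIO()
    with zipfile.ZipFile(buf, 'w') as zf:
        zf.writestr('licenses.csv', 'ACME BAR,MIAMI\nOAK CAFE,TAMPA\n')

    with zipfile.ZipFile(buf) as zf:
        csv_names = [n for n in zf.namelist() if n.lower().endswith('.csv')]
        csv_text = zf.read(csv_names[0]).decode('iso8859-1')

    reader = csv.DictReader(csv_text.splitlines(), fieldnames=['name', 'city'])
    rows = list(reader)  # [{'name': 'ACME BAR', 'city': 'MIAMI'}, ...]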
Example #24
    def clean_detail_record(self, record):
        violations = []
        last_number = 0
        for vio_bits in detail_violations_re.finditer(
                record.pop('violations_html')):
            vio = vio_bits.groupdict()

            # This is a sanity check of the scraper regex to make sure we get
            # every violation. The source data gives a sequential integer
            # number to each violation, so we just make sure the numbers are
            # indeed sequential. If they're not, then our regex is too strict.
            number = int(vio.pop('number'))
            if number - last_number != 1:
                raise ScraperBroken('Did not detect violation #%s at %s' %
                                    (number - 1, record['detail_url']))
            last_number = number

            # Unescape the JavaScript string escaping.
            vio['description'] = vio['description'].replace(r"\'", "'")

            vio['result'] = re.sub(r'\s*<br>\s*', ', ', vio['result'])

            # Skip violations with an empty code. This happens if there are no
            # violations (in this case, the site displays a single, empty
            # violation).
            if vio['code']:
                # We can't just use the violation code to determine uniqueness
                # of the violation type, because sometimes there are violations
                # with the same codes but different descriptions. Here, we use
                # a combination of the code and description as the primary key
                # for the Lookup object. (This will be used by
                # get_or_create_lookup() later.)
                code_for_db = '%s %s' % (vio['code'], vio['description'])
                vio['code_for_db'] = code_for_db[:255]  # Fit database column limit.
                violations.append(vio)
        record['violation_list'] = violations
        return record
Example #26
    def save(self, old_record, list_record, detail_record):
        # Each status change only applies to a single license type (e.g.
        # "Winegrower"). The list page says which license type we're interested
        # in, but only the detail page has the description, so we have to use
        # one to look up the other.
        try:
            license = [t for t in detail_record['license_types'] if t['license_type'][:2] == list_record['type']][0]
        except IndexError:
            raise ScraperBroken('License type %r not found on detail page' % list_record['type'])

        license_type = self.get_or_create_lookup('type', license['license_type'][5:], list_record['type'])
        status = self.get_or_create_lookup('status', license['license_type_status'], license['license_type_status'])
        if not list_record.has_key('action'):
            list_record['action'] = '' # Status changes do not have actions
        action = self.get_or_create_lookup('action', list_record['action'], list_record['action'])

        if self.record_type.code == 'STATUS_CHANGE':
            old_status = self.get_or_create_lookup('old_status', list_record['status_from'], list_record['status_from'])
            new_status = self.get_or_create_lookup('new_status', list_record['status_to'], list_record['status_to'])
        else:
            # New licenses and new applications have no old status.
            old_status = self.get_or_create_lookup('old_status', 'None', 'NONE')
            new_status = self.get_or_create_lookup('new_status', list_record['status'], list_record['status'])

        try:
            qs = NewsItem.objects.filter(schema__id=self.schema.id, item_date=list_record['report_date'])
            qs = qs.by_attribute(self.schema_fields['page_id'], list_record['place_id'])
            qs = qs.by_attribute(self.schema_fields['type'], license_type.id)

            if self.record_type.code == 'STATUS_CHANGE':
                qs = qs.by_attribute(self.schema_fields['old_status'], old_status.id)
                qs = qs.by_attribute(self.schema_fields['new_status'], new_status.id)
            else:
                qs = qs.by_attribute(self.schema_fields['action'], action.id)

            old_record = qs[0]
        except IndexError:
            pass
        else:
            return # No need to save again, if this record already exists.

        title = '%s for %s' % (self.record_type.name, detail_record['business_name'] or detail_record['primary_owner'])

        attributes = {
            'page_id': list_record['place_id'],
            'address': detail_record['address'],
            'business_name': detail_record['business_name'],
            'original_issue_date': license['original_issue_date'],
            'expiration_date': license['expiration_date'],
            'type': license_type.id,
            'status': status.id,
            'license_number': list_record['license_number'],
            'primary_owner': detail_record['primary_owner'],
            'action': action.id,
            'record_type': self.record_type.id,
            'old_status': old_status.id,
            'new_status': new_status.id,
        }
        self.create_newsitem(
            attributes,
            title=title,
            url=detail_url(list_record['place_id']),
            item_date=license['status_date'],
            location_name=detail_record['address'],
        )
Example #27
    def clean_list_record(self, record):
        record['category'] = record['category'].replace(u'\xa0', ' ').replace(
            self.license_city + ' ', '')

        try:
            add = record.pop('Business Location')
        except KeyError:
            add = record.pop('Current Business Location')
        record['address'], record['clean_address'] = clean_washington_address(
            add, self.license_city)
        if 'New Business Location' in record:
            record['new_address'] = clean_washington_address(
                record.pop('New Business Location'), self.license_city)[0]
        else:
            record['new_address'] = ''

        if 'Discontinued Date' in record:
            record['item_date'] = parse_date(record.pop('Discontinued Date'),
                                             '%m/%d/%Y')
        elif 'Approved Date' in record:
            record['item_date'] = parse_date(record.pop('Approved Date'),
                                             '%m/%d/%Y')
        elif 'Notification Date' in record:
            record['item_date'] = parse_date(record.pop('Notification Date'),
                                             '%m/%d/%Y')
        else:
            raise ScraperBroken("Didn't find a date in %r" % record)

        if 'Business Name' in record:
            record['business_name'] = record.pop('Business Name')
        elif 'Current Business Name' in record:
            record['business_name'] = record.pop('Current Business Name')
        else:
            record['business_name'] = ''

        if 'Applicant(s)' in record:
            record['applicant'] = record.pop('Applicant(s)')
        elif 'Current Applicant(s)' in record:
            record['applicant'] = record.pop('Current Applicant(s)')
        else:
            record['applicant'] = ''

        record['new_business_name'] = record.pop('New Business Name', '')
        record['new_applicant'] = record.pop('New Applicant(s)', '')

        license_types = record['Liquor License Type'].split('; ')
        license_types = [re.sub(r'^\d+,\s+', '', lt) for lt in license_types]
        record['license_types'] = [
            re.sub('^DIRECT SHIPMENT RECEIVER-(?:IN/OUT WA|IN WA ONLY)$',
                   'DIRECT SHIPMENT RECEIVER', lt) for lt in license_types
        ]

        try:
            record['title'] = {
                ('DISCONTINUED LIQUOR LICENSES', 'DISCONTINUED'):
                u'Liquor license discontinued for %s',
                ('NEW LIQUOR LICENSE APPLICATIONS', 'ASSUMPTION'):
                u'%s applied to assume license',
                ('NEW LIQUOR LICENSE APPLICATIONS', 'NEW APPLICATION'):
                u'%s applied for new liquor license',
                ('NEW LIQUOR LICENSE APPLICATIONS', 'ADDED/CHANGE OF CLASS/IN LIEU'):
                u'%s applied for additional liquor license class',
                ('NEW LIQUOR LICENSE APPLICATIONS', 'ADDED/CHANGE OF TRADENAME'):
                u'%s applied for trade name change',
                ('NEW LIQUOR LICENSE APPLICATIONS', 'CHANGE OF CORPORATE NAME'):
                u'%s applied for corporate name change',
                ('NEW LIQUOR LICENSE APPLICATIONS', 'CHANGE OF CORPORATE OFFICER'):
                u'%s applied to add or remove a corporate officer',
                ('NEW LIQUOR LICENSE APPLICATIONS', 'CHANGE OF LOCATION'):
                u'%s applied for change of location',
                ('NEW LIQUOR LICENSE APPLICATIONS', 'CHANGE OF LLC MEMBER'):
                u'%s applied to add or remove an LLC member',
                ('NEW LIQUOR LICENSE APPLICATIONS', 'IN LIEU'):
                u'%s applied to change liquor license class',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'ADDED FEES'):
                u'%s approved for addition of fees',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'ASSUMPTION'):
                u'%s approved to assume license',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'NEW APPLICATION'):
                u'%s approved for new liquor license',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'ADDED/CHANGE OF CLASS/IN LIEU'):
                u'%s approved for additional liquor license class',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'ADDED/CHANGE OF TRADENAME'):
                u'%s approved for trade name change',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'CHANGE OF CORPORATE NAME'):
                u'%s approved for corporate name change',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'CHANGE OF CORPORATE OFFICER'):
                u'%s approved to add or remove a corporate officer',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'CHANGE OF LOCATION'):
                u'%s approved for change of location',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'CHANGE OF LLC MEMBER'):
                u'%s approved to add or remove an LLC member',
                ('RECENTLY APPROVED LIQUOR LICENSES', 'IN LIEU'):
                u'%s approved to change liquor license class',
            }[(record['category'], record['Application Type'])]
        except KeyError:
            self.logger.warn('Got unsupported combo %r and %r',
                             record['category'], record['Application Type'])
            raise SkipRecord
        record['title'] = record['title'] % record['business_name']

        return record