Example #1
    def list_pages(self):
        workbook_path = self.retriever.get_to_file(self.get_excel_url())
        yield ExcelDictReader(workbook_path,
                              sheet_index=0,
                              header_row_num=0,
                              start_row_num=1)
        os.unlink(workbook_path)  # Clean up the temporary file.
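A note on the pattern above: because list_pages() is a generator, the os.unlink() call does not run until the caller advances the generator past the yielded reader, so the temporary workbook remains on disk for as long as that reader is in use. The same yield-then-unlink pattern recurs in most of the examples below.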
Example #2
    def list_pages(self):
        for week_end_date in self.week_end_dates:
            # They've used a different URL scheme over time.
            if week_end_date <= datetime.date(2005, 5, 13):
                url = 'http://www.nyc.gov/html/dob/downloads/download/foil/sg%s.xls' % week_end_date.strftime('%m%d%y')
            else:
                url = 'http://www.nyc.gov/html/dob/downloads/excel/sg%s.xls' % week_end_date.strftime('%m%d%y')

            workbook_path = self.retriever.get_to_file(url)
            yield ExcelDictReader(workbook_path, sheet_index=0, header_row_num=2,
                                  start_row_num=3, use_last_header_if_duplicate=False)
            os.unlink(workbook_path) # Clean up the temporary file.
Example #3
    def list_pages(self):
        # Get the HTML page, which includes links to Excel files (one for each
        # borough). We do this instead of hard-coding the Excel file names in
        # the scraper because the Excel file names tend to change each month.
        url = 'http://www.nyc.gov/html/dof/html/property/property_val_sales.shtml'
        html = self.get_html(url)
        excel_links = re.findall(r'href="([^"]+\.xls)"', html)
        if len(excel_links) != 12:
            raise ScraperBroken('Got a strange number of Excel links: %s' % len(excel_links))

        # The first five links are the "New York City Sales Data" links,
        # which is what we want.
        for excel_link in excel_links[:5]:
            excel_url = urlparse.urljoin(url, excel_link)
            workbook_path = self.retriever.get_to_file(excel_url)
            reader = ExcelDictReader(workbook_path, sheet_index=0, header_row_num=4, start_row_num=5)
            yield reader
            os.unlink(workbook_path) # Clean up the temporary file.
Example #4
    def list_pages(self):
        for week_end_date in self.week_end_dates:
            # They've used a different URL scheme over time.
            if week_end_date <= datetime.date(2005, 5, 13):
                url = 'http://www.nyc.gov/html/dob/downloads/download/foil/job%s.xls' % week_end_date.strftime('%m%d%y')
            else:
                url = 'http://www.nyc.gov/html/dob/downloads/excel/job%s.xls' % week_end_date.strftime('%m%d%y')

            try:
                workbook_path = self.retriever.get_to_file(url)
                yield ExcelDictReader(workbook_path,
                                      sheet_index=0,
                                      header_row_num=2,
                                      start_row_num=3)
                os.unlink(workbook_path)  # Clean up the temporary file.
            except PageNotFoundError:
                self.logger.warn("Could not find %s" % url)
Example #5
    def list_pages(self):
        file_path = self.retriever.get_to_file(self.excel_url)
        reader = ExcelDictReader(file_path, sheet_index=0, header_row_num=0, start_row_num=1)
        yield reader
        os.unlink(file_path)  # Clean up the temporary file.
Example #6
    def list_pages(self):
        # No download step here: the reader opens self.excel_file_name
        # directly, so there is no temporary file to unlink afterward.
        reader = ExcelDictReader(self.excel_file_name,
                                 sheet_index=0,
                                 header_row_num=0,
                                 start_row_num=1)
        yield reader
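
All six snippets implement the same list_pages() generator hook, so it may help to sketch how a caller could consume them. This is a minimal sketch, assuming each ExcelDictReader iterates as one dict per data row, keyed by the header row (consistent with the header_row_num/start_row_num arguments above); the run() driver and save() hook are hypothetical stand-ins, not part of the framework shown.

    # Hypothetical driver: 'scraper' is any object defining list_pages(),
    # and save() is an assumed per-row hook, not part of the snippets above.
    def run(scraper):
        for reader in scraper.list_pages():
            # Each reader is assumed to yield dicts keyed by the cell
            # values found in its header_row_num row.
            for row in reader:
                scraper.save(row)

Consuming each reader fully before asking for the next one also lets the yield-then-unlink cleanup in the examples above run at the right time.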