def aggregateData(self):
    """Aggregate every processed Excel workbook into a single CSV file.

    Reads each workbook from the insert-candidate folder (oldest first,
    by modification time) and appends its rows to
    aggregated_data/data.csv.  The header row is copied from the first
    workbook only, so it is not duplicated for every file.

    Requires self.insertCandidateFolder to already exist (created by the
    column-insertion step).
    """
    # Create a new directory for storing the aggregated data.
    aggregateFolder = DirManager(['aggregated_data'])
    aggregateFolder.createFolder()
    new_folder = aggregateFolder.getDirectory()
    new_csv_file = '{}/data.csv'.format(new_folder)

    insertColumsFolder = self.insertCandidateFolder.getDirectory()
    # Sort by mtime so rows come out in download order.
    filenames = sorted(
        [insertColumsFolder + "/" + f for f in listdir(insertColumsFolder)],
        key=path.getmtime)

    # newline='' is required by the csv module; without it csv.writer
    # emits an extra blank row between records on Windows.
    with open(new_csv_file, 'w', newline='') as new_aggregate_csv:
        new_worksheet = csv.writer(new_aggregate_csv, quoting=csv.QUOTE_ALL)
        header_written = False
        # Loop through all workbooks (EXCEL).
        for filename in filenames:
            wb = xlrd.open_workbook(filename)
            sheet = wb.sheet_by_index(0)
            # Skip the header row of every workbook after the first.
            start_row = 1 if header_written else 0
            for rownum in range(start_row, sheet.nrows):
                new_worksheet.writerow(sheet.row_values(rownum))
            header_written = True
def insertCandidates(self, numDownloads, CandidateName):
    """Prepend a CandidateControlledName column to the newest downloads.

    Copies the last ``numDownloads`` workbooks (by modification time)
    from the download directory into insertCandidateControlled/,
    inserting a leading column naming the controlling candidate.  A
    CandidateName of " " is recorded as "Independent".
    """
    print('Processing {} for {}'.format(numDownloads, CandidateName))
    if numDownloads == 0:
        # filenames[-0:] selects EVERY file, not zero files; bail out
        # early (consistent with insertColumns()).
        return
    insertCandidateFolder = DirManager(['insertCandidateControlled'])
    insertCandidateFolder.createFolder()
    new_folder = insertCandidateFolder.getDirectory()
    filenames = sorted(
        [self.download_dir + "/" + f for f in listdir(self.download_dir)],
        key=path.getmtime)
    candidateHeader = "CandidateControlledName"
    # Columns that must be read as strings so codes/IDs are preserved.
    errordTypes = [
        'Cmte_ID', 'Intr_Nam L', 'Intr_City', 'Intr_ST', 'Off_S_H_Cd',
        'XRef_Match'
    ]
    dtype_map = {datatype: str for datatype in errordTypes}
    # Single shared sink for xlrd's warnings; the original opened a new
    # devnull handle per workbook and never closed it (handle leak).
    with open(devnull, 'w') as xlrd_log:
        for fullfilepathname in filenames[-numDownloads:]:
            filename = path.basename(fullfilepathname)
            wb = xlrd.open_workbook(fullfilepathname, logfile=xlrd_log)
            data = pd.read_excel(wb, dtype=dtype_map)
            if CandidateName == " ":
                data.insert(0, candidateHeader, "Independent")
            else:
                data.insert(0, candidateHeader, CandidateName)
            data.to_excel('{}/{}'.format(new_folder, filename), index=False)
class Scraper:
    """Drives a headless Chrome session against the San Jose campaign
    finance search site and downloads the Excel export for every table
    entry, then aggregates the results into one CSV.
    """

    def __init__(self):
        self.DEFAULT_SLEEP_TIME = 5
        self.SEARCH_FORM_ADDRESS = "https://www.southtechhosting.com/SanJoseCity/CampaignDocsWebRetrieval/Search/SearchByElection.aspx"
        # Create data folder in current directory to store files.
        self.website = SjcWebsite()
        self.new_dir = DirManager(["data"])
        self.new_dir.createFolder()
        self.download_dir = self.new_dir.getDirectory()
        self.website.preprocessing = PreProcessing(self.download_dir)
        options = webdriver.ChromeOptions()
        # Enable headless data retrieval unless HEADLESS is explicitly
        # set to a falsy value.  NOTE: os.environ values are strings, so
        # the old check `os.environ.get('HEADLESS', True)` was truthy
        # even for HEADLESS=false; parse the string explicitly.
        headless_env = os.environ.get('HEADLESS')
        isHeadless = (headless_env is None
                      or headless_env.strip().lower() not in ('0', 'false', 'no'))
        if isHeadless:
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--window-size=1280,800")
            options.add_argument("--ignore-certificate-errors")
            options.add_argument("--test_type")
            options.add_argument("--no-sandbox")
            options.add_argument("start-maximized")
            options.add_argument("disable-infobars")
            options.add_argument("--disable-extensions")
        plugs = {"enabled": False, "name": "Chrome PDF Viewer"}
        prefs = {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": False,
            "safebrowsing.disable_download_protection": True,
            "plugins.plugins_list": [plugs],
        }
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(ChromeDriverManager().install(),
                                       options=options)

    def scrape(self, election_cycle=None):
        """Scrape every entry on every search-result page.

        For each entry: extract its table data, open it, and download
        the Excel export (entries without forms raise an error dialog
        that is dismissed instead).  Quits the browser and aggregates
        all downloads into a single CSV when done.
        """
        # Navigate to the election search form.
        self.website.navigateToSearchPage(self.driver,
                                          self.SEARCH_FORM_ADDRESS,
                                          election_cycle=election_cycle)
        self.website.verifySearchTableLoadComplete(self.driver)
        countFile = 0
        for search_page_num in range(1, self.website.numPages(self.driver) + 1):
            print('PAGE {}'.format(search_page_num))
            # Need to navigate to the page upfront so that when we get
            # the number of entries on the page it is accurate.
            self.website.navigateToPage(self.driver, search_page_num)
            for entry_index in self.website.numTableEntries(
                    self.driver, search_page_num):
                print('INDEX {}'.format(entry_index))
                # Downloading an entry brings the website back to page 1,
                # so re-navigate before touching the next entry.
                self.website.navigateToPage(self.driver, search_page_num)
                self.website.extractTableData(self.driver, entry_index)
                self.website.clickEntryIndex(self.driver, entry_index % 10)
                sleep(self.DEFAULT_SLEEP_TIME)
                if self.website.errorDialogExists(self.driver):
                    # No forms for this entry -> dismiss the error message.
                    self.website.closeErrorDialog(self.driver)
                else:
                    # Forms exist: we have been brought to the "forms" page.
                    self.website.verifyDownloadFormTableLoadComplete(self.driver)
                    countFile = self.website.downloadExcel(self.driver, countFile)
                    self.website.clickBackButton(self.driver)
                    self.website.verifySearchTableLoadComplete(self.driver)
        # Close browser once scrape is complete.
        self.driver.quit()
        # Custom module to aggregate data into single CSV.
        self.website.preprocessing.aggregateData()
class PreProcessing():
    """Post-processing for scraped Excel exports.

    insertColumns() copies the newest downloads into insertedData/ with
    candidate/election metadata columns prepended; aggregateData() then
    merges those annotated workbooks into aggregated_data/data.csv.
    insertColumns() must run before aggregateData(), because it creates
    self.insertCandidateFolder.
    """

    def __init__(self, scraper_download_dir):
        # Snapshot of the .xls files present at construction time; the
        # download dir is re-listed later for files added afterwards.
        download_file_dir_wildcard = '{}/*.xls'.format(scraper_download_dir)
        self.filenames = glob.glob(download_file_dir_wildcard)
        self.download_dir = scraper_download_dir

    def aggregateData(self):
        """Merge every annotated workbook into a single CSV.

        Workbooks are read oldest-first (by mtime); the header row is
        copied from the first workbook only to avoid duplicates.
        """
        # Create new directory for storing aggregated data.
        aggregateFolder = DirManager(['aggregated_data'])
        aggregateFolder.createFolder()
        new_folder = aggregateFolder.getDirectory()
        new_csv_file = '{}/data.csv'.format(new_folder)
        insertColumsFolder = self.insertCandidateFolder.getDirectory()
        filenames = sorted(
            [insertColumsFolder + "/" + f for f in listdir(insertColumsFolder)],
            key=path.getmtime)
        # newline='' is required by the csv module; without it csv.writer
        # emits an extra blank row between records on Windows.
        with open(new_csv_file, 'w', newline='') as new_aggregate_csv:
            new_worksheet = csv.writer(new_aggregate_csv,
                                       quoting=csv.QUOTE_ALL)
            header_written = False
            # Loop through all workbooks (EXCEL).
            for filename in filenames:
                wb = xlrd.open_workbook(filename)
                sheet = wb.sheet_by_index(0)
                # Skip the header row of every workbook after the first.
                start_row = 1 if header_written else 0
                for rownum in range(start_row, sheet.nrows):
                    new_worksheet.writerow(sheet.row_values(rownum))
                header_written = True

    def insertColumns(self, numDownloads, CandidateName, ElectionDate,
                      BallotItem):
        """Prepend metadata columns to the newest ``numDownloads``
        workbooks and save the copies under insertedData/.

        Inserts Ballot Item, Election Date, and CandidateControlledName
        (in that final column order); a CandidateName of " " is recorded
        as "Independent".
        """
        print('Processing {} for {}'.format(numDownloads, CandidateName))
        if numDownloads == 0:
            # Nothing to annotate; also avoids filenames[-0:] selecting
            # every file below.
            return
        self.insertCandidateFolder = DirManager(['insertedData'])
        self.insertCandidateFolder.createFolder()
        new_folder = self.insertCandidateFolder.getDirectory()
        filenames = self.insertColumnsHelper()
        candidateHeader = "CandidateControlledName"
        electionDateHeader = "Election Date"
        ballotItemHeader = "Ballot Item"
        print(filenames)
        # Columns that must be read as strings so codes/IDs are preserved.
        errordTypes = [
            'Cmte_ID', 'Intr_Nam L', 'Intr_City', 'Intr_ST', 'Off_S_H_Cd',
            'XRef_Match'
        ]
        dtype_map = {datatype: str for datatype in errordTypes}
        # Single shared sink for xlrd's warnings; the original opened a
        # new devnull handle per workbook and never closed it (leak).
        with open(devnull, 'w') as xlrd_log:
            for fullfilepathname in filenames[-numDownloads:]:
                filename = path.basename(fullfilepathname)
                print(filename)
                wb = xlrd.open_workbook(fullfilepathname, logfile=xlrd_log)
                data = pd.read_excel(wb, dtype=dtype_map)
                if CandidateName == " ":
                    data.insert(0, candidateHeader, "Independent")
                else:
                    data.insert(0, candidateHeader, CandidateName)
                data.insert(0, electionDateHeader, ElectionDate)
                data.insert(0, ballotItemHeader, BallotItem)
                data.to_excel('{}/{}'.format(new_folder, filename),
                              index=False)

    def insertColumnsHelper(self):
        """Block until the newest file in the download directory is a
        completed export, then return all files sorted by mtime
        (oldest first).

        NOTE(review): loops forever if the expected export never
        appears — consider adding a timeout.
        """
        def _list_by_mtime():
            # Re-list the download dir sorted by modification time.
            return sorted(
                [self.download_dir + FILE_DIVIDER + f
                 for f in listdir(self.download_dir)],
                key=path.getmtime)

        filenames = _list_by_mtime()
        while True:
            newest = path.basename(filenames[-1])
            print(newest)
            # Chrome writes in-progress downloads as *.crdownload; wait
            # for a finished transactionExportGrid file.
            if ("transactionExportGrid" in newest
                    and "crdownload" not in newest):
                return filenames
            sleep(3)
            filenames = _list_by_mtime()