def __attrs_post_init__(self):
    """Build the search-results URL for this crawler's letter and attach a navigator.

    Formatting the URL will (intentionally) raise if ``self.candidate_letter``
    is improperly defined or undefined after attrs initialization.
    """
    template = ('http://media.ethics.ga.gov/search/Campaign/'
                'Campaign_Namesearchresults.aspx'
                '?CommitteeName=&LastName={}&FirstName=&Method=0')
    self.url = template.format(self.candidate_letter)
    self.navigator = SeleniumNavigator()
def __attrs_post_init__(self):
    """Seed the crawl with the last-name-'a' search-results page and create a navigator."""
    # NOTE(review): only the letter-'a' results page is crawled here; an
    # earlier (commented-out) variant generated one URL per lowercase letter.
    url = ('http://media.ethics.ga.gov/search/Campaign/'
           'Campaign_Namesearchresults.aspx'
           '?CommitteeName=&LastName=a&FirstName=&Method=0')
    self.search_results_urls = [url]
    self.navigator = SeleniumNavigator()
def __attrs_post_init__(self):
    """Build the single search-results URL for ``self.letter`` and set up navigation."""
    template = ('http://media.ethics.ga.gov/search/Campaign/'
                'Campaign_Namesearchresults.aspx'
                '?CommitteeName=&LastName={}&FirstName=&Method=0')
    self.search_results_urls = [template.format(self.letter)]
    logging.info(self.letter)
    logging.info(self.search_results_urls)
    self.navigator = SeleniumNavigator(letter=self.letter)
class FirstPassCrawler:
    """First crawl pass for one last-name initial.

    Walks the Georgia ethics search results for ``candidate_letter``,
    visits each candidate profile, and persists candidates, offices and
    campaign-finance report URLs through the attached session.
    """

    session = attr.ib()           # SQLAlchemy session used for all writes
    candidate_letter = attr.ib()  # last-name initial this crawler covers
    navigator = attr.ib(init=False)

    def __attrs_post_init__(self):
        # Default search URL formatted with the given letter. Formatting
        # will (well, should) raise if self.candidate_letter is improperly
        # defined or undefined after the class object is initialized.
        self.url = f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={self.candidate_letter}&FirstName=&Method=0'
        self.navigator = SeleniumNavigator()

    def exit(self):
        """Release the database session."""
        self.session.close()

    # 3-1.1: Same idea as the report and office get_or_add helpers:
    # return the existing CandidateId, or insert and return the new one.
    # Returns None if the database operation fails (error is logged).
    def get_or_add_candidate(self, candidate):
        try:
            query_result = self.session.query(Candidate).filter_by(
                FilerId=candidate['FilerId'],
                Firstname=candidate['Firstname'],
                Lastname=candidate['Lastname']).first()
            if query_result:
                return query_result.CandidateId
            candidate = Candidate(**candidate)
            self.session.add(candidate)
            self.session.commit()
            return candidate.CandidateId
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    # 4.1: Same as get_or_add_office, but for reports (keyed by Url).
    def get_or_add_report(self, report):
        try:
            query_result = self.session.query(Report).filter_by(
                Url=report['Url']).first()
            if query_result:
                return query_result.ReportId
            report = Report(**report)
            self.session.add(report)
            self.session.commit()
            return report.ReportId
        except Exception as e:
            # Fix: roll back before logging, matching the other
            # get_or_add_* helpers (the original logged first).
            self.session.rollback()
            logging.info(e)

    # 2.1: Either adds the office for which a candidate has run/is running
    # or finds that it already exists, then returns the office's id.
    def get_or_add_office(self, office):
        try:
            query_result = self.session.query(Office).filter_by(
                Name=office.Name).first()
            if query_result:
                return query_result.OfficeId
            self.session.add(office)
            self.session.commit()
            return office.OfficeId
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    # 3-2: Crawls the campaign finance reports table, driving the dropdowns
    # and buttons that expose it so Selenium can scrape each row and
    # navigate to the report's reference page.
    def crawl_reports_table(self, candidate_id):
        dropdown = DropdownParser(self.navigator.page_source())
        if dropdown.parse() is not None:
            try:
                self.navigator.click_dropdown()
                parser = ReportsTableParser(self.navigator.page_source())
                for report_link, report in parser.parse():
                    if report_link is None:
                        logging.info('No report found.')
                        continue
                    try:
                        self.navigator.wait_for_it(report_link)
                        self.navigator.click_link(report_link)
                        self.navigator.wait_for_contributions_id()
                        report['CandidateId'] = candidate_id
                        report['Url'] = self.navigator.get_current_url()
                        self.get_or_add_report(report)
                        # Return to the table and re-open the dropdown for
                        # the next report row.
                        self.navigator.back()
                        self.navigator.click_dropdown()
                    except Exception as e:
                        logging.info(e)
                        logging.info(f'Report link id: {report_link}')
            except Exception as e:
                logging.info(e)

    # 3-1: Encapsulates the parser for campaign registration information.
    def crawl_registration_info(self, candidate):
        parser = CandidateRegistrationParser(self.navigator.page_source())
        ret_candidate = parser.parse(candidate)
        return self.get_or_add_candidate(ret_candidate)

    # 2: Crawls basic data from the candidate profile: offices sought, the
    # dropdown links exposing the finance reports, and registration info
    # such as address and political party.
    def crawl_candidate_profile(self, url, candidate):
        parser = CandidateProfileParser(self.navigator.page_source())
        for dropdown, office, current_candidate in parser.parse(candidate):
            # Both branches need the office stored and linked, so do it
            # once up front (deduplicated from the original branches).
            office_id = self.get_or_add_office(office)
            current_candidate['OfficeId'] = office_id
            if dropdown is None:
                logging.info(
                    f"No dropdown for {current_candidate['Firstname']} "
                    f"{current_candidate['Lastname']}")
                self.crawl_registration_info(current_candidate)
                continue
            self.navigator.expose_dropdown(dropdown)
            candidate_id = self.crawl_registration_info(current_candidate)
            try:
                self.crawl_reports_table(candidate_id)
            except Exception as e:
                logging.info(e)
            self.navigator.navigate(url)

    # 1: Crawls the candidate profile links for the given url parameter.
    # The SearchResultsParser yields each candidate's name dict plus the
    # link to that candidate's individual profile page.
    def crawl_candidate_profile_links(self, url):
        self.navigator.navigate(url)
        parser = SearchResultsParser(self.navigator.page_source())
        for candidate, current_link in parser.parse():
            if current_link is None:
                continue
            logging.info(f"Navigating to {candidate['Firstname']} "
                         f"{candidate['Lastname']}")
            try:
                self.navigator.wait_for_it(current_link)
                self.navigator.click_link(current_link)
                self.crawl_candidate_profile(url, candidate)
            except Exception as e:
                # Maybe include a self.navigator.navigate(url) call here in
                # case the page doesn't load correctly.
                logging.info(e)

    # 0: Entry point called from app.py; runs the crawl and always closes
    # the browser afterwards.
    def crawl(self):
        try:
            self.crawl_candidate_profile_links(self.url)
        except Exception as e:
            logging.info(e)
        self.navigator.close_browser()
def __attrs_post_init__(self):
    """Wire up the navigator and file processor for this crawler's letter."""
    logging.info('attrs post init called')
    letter = self.letter
    self.navigator = SeleniumNavigator(
        loading_strategy='none',
        letter=letter,
    )
    self.file_processor = FileProcessor(letter=letter)
class SecondPassCrawler:
    """Second crawl pass: revisits stored report URLs and archives each
    contributions CSV as a ScrapeLog row tied to its candidate."""

    session = attr.ib()               # SQLAlchemy session
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)
    letter = attr.ib()                # last-name initial this crawler covers

    def __attrs_post_init__(self):
        logging.info('attrs post init called')
        self.navigator = SeleniumNavigator(loading_strategy='none',
                                           letter=self.letter)
        self.file_processor = FileProcessor(letter=self.letter)

    def exit(self):
        """Shut down the browser, then release the database session."""
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        """Return (CandidateId, Url) pairs for the reports to scrape.

        NOTE(review): the candidate filter is hard-coded to last name
        'zorn' - this looks like a debugging leftover; confirm before
        relying on it for a full crawl.
        """
        _ids = self.session.query(Candidate).filter(
            Candidate.Lastname.ilike('zorn')).all()
        reports = []
        for _id in _ids:
            results = self.session.query(Report).filter_by(
                CandidateId=_id.CandidateId).all()
            logging.info(results)
            # Idiom fix: extend with a generator rather than a manual
            # inner append loop.
            reports.extend((result.CandidateId, result.Url)
                           for result in results)
        return reports

    def add_scrapelog_to_db(self, _id, url, content, dtime):
        """Persist one scraped CSV payload; roll back on failure."""
        slog = ScrapeLog(CandidateId=_id, ScrapeDate=dtime,
                         RawData=content, PageURL=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self, _id):
        """Click the CSV download link on the current page and store the file."""
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            logging.info('Adding scrapelog to database')
            self.add_scrapelog_to_db(_id, url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self, _id):
        """Open the contributions view, then hand off to the CSV download."""
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link(_id)

    def crawl(self):
        """Visit every stored report URL and scrape its contributions CSV."""
        for _id, url in self.get_urls():
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids(_id)
def __attrs_post_init__(self):
    """Create the CSV file processor and a Selenium navigator that does not
    wait for full page loads."""
    self.file_processor = FileProcessor()
    self.navigator = SeleniumNavigator(loading_strategy='none')
class SecondPassCrawler:
    """Second crawl pass: visits report URLs and archives each
    contributions CSV as a ScrapeLog row."""

    session = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)

    def __attrs_post_init__(self):
        self.navigator = SeleniumNavigator(loading_strategy='none')
        self.file_processor = FileProcessor()

    def exit(self):
        """Shut down the browser, then the database session."""
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        """Return the report URLs to visit (currently a single fixed URL)."""
        # results = self.session.query(Report).all()
        # return (report.url for report in results)
        return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, url, content, dtime):
        """Persist one scraped CSV payload; roll back on failure."""
        slog = ScrapeLog(scrape_date=dtime, raw_data=content, page_url=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self):
        """Find and click the CSV download link, then record the file."""
        link = CSVLinkParser(self.navigator.page_source()).parse()
        if link is None:
            return
        logging.info(f'Parsed link: {link}')
        page_url = self.navigator.get_current_url()
        self.navigator.click_link(link)
        logging.info('Clicking download link for csv file.')
        content, dtime = self.file_processor.process()
        self.add_scrapelog_to_db(page_url, content, dtime)
        self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self):
        """Open the contributions view, then hand off to the CSV download."""
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        link = ContributionsViewParser(self.navigator.page_source()).parse()
        if link is None:
            return
        logging.info(f'Parsed link: {link}')
        self.navigator.click_link(link)
        self.navigator.wait_for_csv_link()
        self.crawl_download_link()

    def crawl(self):
        """Visit every report URL and scrape its contributions CSV."""
        for url in self.get_urls():
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids()
class FirstPassCrawler:
    """First crawl pass for one last-name initial.

    Walks the Georgia ethics search results for ``letter``, visits each
    candidate profile, and persists candidates, offices and campaign
    finance report URLs through the attached session.
    """

    session = attr.ib()   # SQLAlchemy session used for all writes
    # NOTE(review): candidate_list is never assigned in this class -
    # confirm it is still needed.
    candidate_list = attr.ib(init=False)
    navigator = attr.ib(init=False)
    letter = attr.ib()    # last-name initial this crawler covers

    def __attrs_post_init__(self):
        self.search_results_urls = \
            [f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={self.letter}&FirstName=&Method=0']
        logging.info(self.letter)
        logging.info(self.search_results_urls)
        self.navigator = SeleniumNavigator(letter=self.letter)

    def exit(self):
        """Release the database session."""
        self.session.close()

    # 3-1.1: Return the existing CandidateId, or insert the candidate and
    # return the new id. Returns None on database failure (error logged).
    def get_or_add_candidate(self, candidate):
        try:
            query_result = self.session.query(Candidate).filter_by(
                FilerId=candidate['FilerId'],
                Firstname=candidate['Firstname'],
                Lastname=candidate['Lastname']).first()
            if query_result:
                return query_result.CandidateId
            candidate = Candidate(**candidate)
            self.session.add(candidate)
            self.session.commit()
            return candidate.CandidateId
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    # 4.1: Return the existing ReportId (keyed by Url), or insert the
    # report and return the new id.
    def get_or_add_report(self, report):
        try:
            query_result = self.session.query(Report).filter_by(
                Url=report['Url']).first()
            if query_result:
                return query_result.ReportId
            report = Report(**report)
            self.session.add(report)
            self.session.commit()
            return report.ReportId
        except Exception as e:
            # Fix: roll back before logging, matching the other
            # get_or_add_* helpers (the original logged first).
            self.session.rollback()
            logging.info(e)

    # 2.1: Return the existing OfficeId (keyed by Name), or insert the
    # office and return the new id.
    def get_or_add_office(self, office):
        try:
            query_result = self.session.query(Office).filter_by(
                Name=office.Name).first()
            if query_result:
                return query_result.OfficeId
            self.session.add(office)
            self.session.commit()
            return office.OfficeId
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    # 3-2: Crawls the campaign finance reports table, driving the dropdown
    # so Selenium can scrape each row and visit the report page.
    def crawl_reports_table(self, candidate_id):
        dropdown = DropdownParser(self.navigator.page_source())
        if dropdown.parse() is not None:
            try:
                self.navigator.click_dropdown()
                parser = ReportsTableParser(self.navigator.page_source())
                for report_link, report in parser.parse():
                    if report_link is None:
                        logging.info('No report found.')
                        continue
                    try:
                        self.navigator.wait_for_it(report_link)
                        self.navigator.click_link(report_link)
                        self.navigator.wait_for_contributions_id()
                        report['CandidateId'] = candidate_id
                        report['Url'] = self.navigator.get_current_url()
                        self.get_or_add_report(report)
                        # Return to the table and re-open the dropdown for
                        # the next report row.
                        self.navigator.back()
                        self.navigator.click_dropdown()
                    except Exception as e:
                        logging.info(e)
                        logging.info(f'Report link id: {report_link}')
            except Exception as e:
                logging.info(e)

    # 3-1: Parses the campaign registration info and stores the candidate.
    def crawl_registration_info(self, candidate):
        parser = CandidateRegistrationParser(self.navigator.page_source())
        ret_candidate = parser.parse(candidate)
        return self.get_or_add_candidate(ret_candidate)

    # 2: Crawls basic data from the candidate profile: offices sought, the
    # dropdown links exposing the finance reports, and registration info.
    def crawl_candidate_profile(self, url, candidate):
        parser = CandidateProfileParser(self.navigator.page_source())
        for dropdown, office, current_candidate in parser.parse(candidate):
            # Both branches need the office stored and linked, so do it
            # once up front (deduplicated from the original branches).
            office_id = self.get_or_add_office(office)
            current_candidate['OfficeId'] = office_id
            if dropdown is None:
                logging.info(
                    f"No dropdown for {current_candidate['Firstname']} "
                    f"{current_candidate['Lastname']}")
                self.crawl_registration_info(current_candidate)
                continue
            self.navigator.expose_dropdown(dropdown)
            candidate_id = self.crawl_registration_info(current_candidate)
            try:
                self.crawl_reports_table(candidate_id)
            except Exception as e:
                logging.info(e)
            self.navigator.navigate(url)

    # 1: Crawls the candidate profile links for the given url, navigating
    # to each candidate's individual profile page.
    def crawl_candidate_profile_links(self, url):
        self.navigator.navigate(url)
        parser = SearchResultsParser(self.navigator.page_source())
        for candidate, current_link in parser.parse():
            if current_link is None:
                continue
            logging.info(f"Navigating to {candidate['Firstname']} "
                         f"{candidate['Lastname']}")
            try:
                self.navigator.wait_for_it(current_link)
                self.navigator.click_link(current_link)
                self.crawl_candidate_profile(url, candidate)
            except Exception as e:
                logging.info(e)

    # 0: Entry point; crawls each search-results URL and always closes the
    # browser afterwards.
    def crawl(self):
        for url in self.search_results_urls:
            try:
                self.crawl_candidate_profile_links(url)
            except Exception as e:
                logging.info(e)
        self.navigator.close_browser()
class Crawler:
    """Crawls the Georgia ethics search results, persisting candidates,
    offices and campaign-finance report URLs."""

    session = attr.ib()   # SQLAlchemy session used for all writes
    # NOTE(review): candidate_list is never assigned in this class -
    # confirm it is still needed.
    candidate_list = attr.ib(init=False)
    navigator = attr.ib(init=False)

    def __attrs_post_init__(self):
        self.search_results_urls = [
            'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName=a&FirstName=&Method=0'
        ]
        # self.search_results_urls = (f'http://media.ethics.ga.gov/search/\
        # Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName=\
        # {character}&FirstName=&Method=0' for character in string.ascii_lowercase)
        self.navigator = SeleniumNavigator()

    def exit(self):
        """Release the database session."""
        self.session.close()

    def add_candidate_to_db(self, candidate):
        """Insert a candidate and return its id (id access may still fail
        if the commit was rolled back)."""
        try:
            self.session.add(candidate)
            self.session.commit()
        except Exception as e:
            logging.info(e)
            # Fix: roll back on failure like the office/report helpers do;
            # the original left the session in a broken state.
            self.session.rollback()
        return candidate.id

    def add_office_to_db(self, office):
        """Insert an office and return its id."""
        try:
            self.session.add(office)
            self.session.commit()
        except Exception as e:
            logging.info(e)
            self.session.rollback()
        return office.id

    def add_report_to_db(self, report):
        """Insert a report and return its id."""
        try:
            self.session.add(report)
            self.session.commit()
        except Exception as e:
            logging.info(e)
            self.session.rollback()
        return report.id

    def crawl_reports_table(self, office_id):
        """Drive the reports-table dropdown and store each report row."""
        dropdown = DropdownParser(self.navigator.page_source())
        if dropdown.parse() is not None:
            try:
                self.navigator.click_dropdown_initial()
                parser = ReportsTableParser(self.navigator.page_source())
                res = parser.parse()
                for report_link, report in res:
                    try:
                        self.navigator.click_link(report_link)
                        self.navigator.wait_for_contributions_id()
                        report.url = self.navigator.get_current_url()
                        report.office_id = office_id
                        self.add_report_to_db(report)
                        # Return to the table and re-open the dropdown for
                        # the next report row.
                        self.navigator.back()
                        self.navigator.click_dropdown_subsequent()
                    except Exception as e:
                        logging.info(e)
            except Exception as e:
                logging.info(e)

    def crawl_candidate_profile(self, url, candidate):
        """Store the candidate and each office found on the profile page,
        then crawl that office's reports table."""
        parser = CandidateProfileParser(self.navigator.page_source())
        logging.info(
            f'Crawling page for {candidate.firstname} {candidate.lastname}')
        for dropdown_link, office in parser.parse():
            if dropdown_link is None:
                continue
            # NOTE(review): the candidate row is re-added on every
            # iteration, which can create duplicates - confirm intent.
            candidate_id = self.add_candidate_to_db(candidate)
            office.candidate_id = candidate_id
            office_id = self.add_office_to_db(office)
            self.navigator.expose_dropdown(dropdown_link)
            try:
                self.crawl_reports_table(office_id)
            except Exception as e:
                logging.info(e)
            self.navigator.navigate(url)

    def crawl_candidate_profile_links(self, url):
        """Visit each candidate profile linked from the search results."""
        self.navigator.navigate(url)
        parser = SearchResultsParser(self.navigator.page_source())
        for candidate, current_link in parser.parse():
            self.navigator.click_link(current_link)
            try:
                self.crawl_candidate_profile(url, candidate)
            except Exception as e:
                logging.info(e)

    def crawl(self):
        """Entry point: crawl every search-results URL, then close the browser."""
        for url in self.search_results_urls:
            logging.info(f'Crawling {url}')
            try:
                self.crawl_candidate_profile_links(url)
            except Exception as e:
                logging.info(e)
        self.navigator.close_browser()