def process_google_results(self, result):
    """
    Get values from the geocoding results.

    https://developers.google.com/maps/documentation/geocoding/intro#GeocodingResponses

    :param result: Results from Google Geocoding API ("results" list only).
    :type result: list
    :returns: This location's rating, latitude, longitude and ZIP code.
    :rtype: dict
    """
    # TODO: Handle more than one returned location in result.
    # Could compare accuracies and use that to decide which to store.
    loc = result[0]

    values = {
        'latitude': loc['geometry']['location']['lat'],
        'longitude': loc['geometry']['location']['lng'],
        'rating': loc['geometry']['location_type']}

    try:
        # Fragile: assumes the postal code is always component index 7.
        values['zip_code'] = loc['address_components'][7]['short_name']
    except (IndexError, KeyError):
        log.info("No zip code.")
        values['zip_code'] = "None"  # TODO: Leave blank instead?

    return values

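# A minimal sketch of the multiple-results TODO in process_google_results()
# above: rank candidates by Google's documented location_type accuracy and
# keep the most precise one. The helper name and the unknown-type fallback
# are assumptions, not existing code in this pipeline.
def pick_most_accurate_result(results):
    """Return the geocoding result with the most precise location_type."""
    # ROOFTOP is most precise; APPROXIMATE is least (per the Google
    # Geocoding API docs). Unknown types sort last.
    accuracy_order = [
        'ROOFTOP', 'RANGE_INTERPOLATED', 'GEOMETRIC_CENTER', 'APPROXIMATE']
    rank = {loc_type: i for i, loc_type in enumerate(accuracy_order)}
    return min(
        results,
        key=lambda r: rank.get(r['geometry']['location_type'], len(rank)))

# Usage: loc = pick_most_accurate_result(result) instead of result[0].
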
def login(self):
    """Load homepage, find login, enter credentials."""
    self.load_homepage()
    # time.sleep(1.0)

    self.find_login_link()
    log.info('Sleep 1.0 second')
    time.sleep(1.0)

    self.enter_username()
    log.info('Sleep 1.0 second')
    time.sleep(1.0)

    self.enter_password()
    log.info('Sleep 5.0 seconds')
    time.sleep(5.0)

    try:
        # The logout link only appears when logged in.
        self.driver.find_element_by_id("Header1_lnkLogout")
        log.info("Login successful")
    except Exception as error:
        log.info("Login failed")
        log.exception(error)
        raise

def geocode(self):
    """Update latitude, longitude, rating and ZIP in Locations table."""
    print('\nGeocoding...')

    null_rating_rows = self.get_rows_with_null_rating()

    for row in null_rating_rows:
        full_address = "{0} {1}, New Orleans, LA".format(
            row.street_number, row.address)

        result = self.gmaps.geocode(full_address)

        if len(result) == 0:
            log.info('No geocoding results for: {}'.format(full_address))
            # TODO: Need to also note failure so future geocoding scripts
            # don't keep trying and failing on the same addresses.
            # Possibly update Location's `rating` and/or Cleaned's
            # `location_publish` fields.
            continue

        details = self.process_google_results(result)

        try:
            with SESSION.begin_nested():
                u = update(Location)
                u = u.values(details)
                u = u.where(Location.document_id == row.document_id)
                SESSION.execute(u)
                SESSION.flush()
        except Exception as error:  # TODO: Handle specific errors.
            log.exception(error, exc_info=True)
            SESSION.rollback()

    SESSION.commit()

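# A hedged sketch of the failure-marking TODO in geocode() above: write a
# sentinel rating so get_rows_with_null_rating() stops returning addresses
# that already failed. The method name and the 'FAILED' sentinel are
# assumptions, not an existing convention in this codebase.
def mark_geocoding_failure(self, document_id):
    """Flag a location so future geocoding runs skip it."""
    u = update(Location)
    u = u.values({'rating': 'FAILED'})  # Assumed sentinel value.
    u = u.where(Location.document_id == document_id)
    SESSION.execute(u)
    SESSION.commit()
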
def cli(arguments):
    """Parse command-line arguments."""
    # Catch any missed errors.
    if cli_has_errors(arguments):
        return

    if arguments['<single_date>']:  # Single date
        early_date = arguments['<single_date>']
        late_date = arguments['<single_date>']
        log.info('Initializing single date: {}.'.format(early_date))
    elif arguments['<early_date>'] and arguments['<late_date>']:  # Date range
        early_date = arguments['<early_date>']
        late_date = arguments['<late_date>']
        log.info('Initializing date range: {0} to {1}.'.format(
            early_date, late_date))

    # Check for errors
    early_datetime = datetime.strptime(early_date, "%Y-%m-%d")
    late_datetime = datetime.strptime(late_date, "%Y-%m-%d")

    if early_datetime > late_datetime:
        raise BadDateRangeError("The date range does not make sense.")

    DeleteDates(initial_date=early_date, until_date=late_date).main()

def rebuild_days(early_date, late_date):
    """Scrape and initialize dates."""
    print(early_date, late_date)

    # Build those newly scraped records.
    # This will set perm_flag = True in
    # checkPermanentStatusOfNewSales().
    log.info('doitall')

def home():
    """Receive a GET call for the homepage (/) and return the view."""
    data = Models().get_home()
    log.info(data)

    view = Views().get_home(data)
    log.info(view)

    return view

def delete_permanent_date_range_file():
    """Delete old most-recent-permanent-date-range/*.html."""
    # Delete old file first
    log.info('Delete old most-recent-permanent-date-range/*.html file')
    file_string = "{}/data/most-recent-permanent-date-range/*.html".format(
        PROJECT_DIR)

    for file_path in glob.glob(file_string):
        os.remove(file_path)

def click_advanced_tab(self):
    """Click on the advanced tab."""
    html_id = 'x:2130005445.2:mkr:ti1'
    log.info('Find advanced tab at HTML ID {}'.format(html_id))
    advanced_tab_elem = self.driver.find_element_by_id(html_id)

    log.info('Click on advanced tab')
    advanced_tab_elem.click()

def click_search_button(self):
    """Click on the search button."""
    html_id = 'cphNoMargin_SearchButtons2_btnSearch__1'
    log.info('Find search button at HTML ID {}'.format(html_id))
    search_button_elem = self.driver.find_element_by_id(html_id)

    log.info('Click search button')
    search_button_elem.click()

def enter_username(self):
    """Type in username."""
    html_id = 'Header1_txtLogonName'
    log.info('Find username field at HTML ID {}'.format(html_id))
    username_elem = self.driver.find_element_by_id(html_id)

    log.info('Enter username from environment variable')
    username_elem.send_keys(os.environ.get('REAL_ESTATE_LRD_USERNAME'))

def find_login_link(self):
    """Find and click on login link."""
    html_id = 'Header1_lnkLogin'
    log.info('Find login link at HTML ID {}'.format(html_id))
    login_link_elem = self.driver.find_element_by_id(html_id)

    log.info('Click login link')
    login_link_elem.click()

def delete_permanent_date_range_when_scraped_file(year, month, day):
    """Delete old permanent-date-range-when-scraped*.html."""
    log.info('Delete old permanent-date-range-when-scraped*.html')
    string = (
        "{0}/data/raw/{1}-{2}-{3}/" +
        "permanent-date-range-when-scraped*.html").format(
            PROJECT_DIR, year, month, day)

    for file_path in glob.glob(string):
        os.remove(file_path)

def page_not_found(error):
    """
    Return an error page.

    :param error: The exception that triggered this handler.
    :type error: Exception
    :returns: The view.
    """
    log.info(error)

    view = Views().get_error_page()
    return view

def save_permanent_date_range_when_scraped_file(year, month, day,
                                                date_range_html,
                                                first_date, second_date):
    """Save new permanent-date-range-when-scraped*.html."""
    # Save permanent date range for this individual sale.
    log.info('Save new permanent-date-range-when-scraped*.html file')

    file_path = (
        "{0}/data/raw/{1}-{2}-{3}/" +
        "permanent-date-range-when-scraped_{4}-{5}.html").format(
            PROJECT_DIR, year, month, day, first_date, second_date)

    with open(file_path, "wb") as individual_html_out:
        individual_html_out.write(date_range_html.encode('utf-8'))

def find_permanent_date_range(self):
    """Parse search page for permanent date range."""
    html_id = 'cphNoMargin_lblSearchSummary'
    log.info('Find permanent date range at HTML ID {}'.format(html_id))
    date_range_elem = self.driver.find_element_by_id(html_id)

    match = re.match(r"Permanent Index From ([0-9/]*) to ([0-9/]*)",
                     date_range_elem.text)

    if match is None:
        raise ValueError(
            'Could not parse permanent date range from: {}'.format(
                date_range_elem.text))

    first_date = match.group(1).replace('/', '')  # e.g. 02/18/2014 -> 02182014
    second_date = match.group(2).replace('/', '')

    return first_date, second_date

def save_permanent_date_range_file(date_range_html, first_date, second_date):
    """Save new most-recent-permanent-date-range/*.html."""
    log.info('Save new most-recent-permanent-date-range/*.html file')
    fn = "{0}/data/most-recent-permanent-date-range/{1}-{2}.html".format(
        PROJECT_DIR, first_date, second_date)

    if not os.path.exists(os.path.dirname(fn)):
        os.makedirs(os.path.dirname(fn))

    with open(fn, "wb") as overall_html_out:
        overall_html_out.write(date_range_html.encode('utf-8'))

def parse_results(self, year, month, day):
    """Parse initial result page for total number of sales."""
    html_id = 'cphNoMargin_cphNoMargin_OptionsBar1_ItemList'

    try:
        log.info('Find results list at HTML ID {}'.format(html_id))
        item_list_elem = self.driver.find_element_by_id(html_id)
        # log.info('Find option')
        options = item_list_elem.find_elements_by_tag_name("option")
    except Exception as error:
        log.info('No sales for this day')
        log.error(error, exc_info=True)

        # Save the page anyway, for debugging.
        html_out = '{}/data/raw/{}-{}-{}/page-html/page1.html'.format(
            PROJECT_DIR, year, month, day)
        with open(html_out, 'wb') as f_out:
            f_out.write((self.driver.page_source).encode('utf-8'))

        return

    total_pages = int(options[-1].get_attribute('value'))
    log.info('{0} pages of records for {1}-{2}-{3}'.format(
        total_pages, year, month, day))

    for i in range(1, total_pages + 1):
        self.parse_page(i, year, month, day)
        log.info('Sleep 5.0 seconds')
        time.sleep(5.0)

def search_parameters(self, search_date):
    """Enter search parameters."""
    self.click_advanced_tab()
    time.sleep(2.0)

    self.enter_date_filed_from(search_date)
    self.enter_date_filed_to(search_date)
    self.select_document_type()
    time.sleep(1.0)

    self.click_search_button()
    log.info('Sleep 5.0 seconds')
    time.sleep(5.0)

def scrape_days(early_date, late_date):
    """Scrape the given date range over again."""
    early_datetime = datetime.strptime(early_date, '%Y-%m-%d')
    log.debug(early_datetime)
    late_datetime = datetime.strptime(late_date, '%Y-%m-%d')
    log.debug(late_datetime)

    # Scrape those days over again
    log.info('scrape')
    try:
        Scrape(initial_date=early_datetime,
               until_date=late_datetime).main()
    except Exception as error:
        log.error(error, exc_info=True)

def select_document_type(self):
    """Select SALE document type in dropdown."""
    html_id = 'cphNoMargin_f_dclDocType_297'  # SALE
    # TODO: Assert text is SALE
    log.info('Find document type SALE at HTML ID {}'.format(html_id))
    doc_type_elem = self.driver.find_element_by_id(html_id)

    short_type = doc_type_elem.get_attribute('value')
    parent_elem = doc_type_elem.find_element_by_xpath('..')
    long_type = parent_elem.find_element_by_tag_name('label').text
    log.info('Document type is {} ({})'.format(long_type, short_type))

    doc_type_elem.click()

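# A minimal sketch of the "Assert text is SALE" TODO in select_document_type()
# above, for insertion right after long_type is read. Assumption: the label
# text for this hardcoded HTML ID always contains the word "SALE".
#
#     assert 'SALE' in long_type.upper(), \
#         'Expected SALE document type, found {}'.format(long_type)
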
def get_home(self):
    """
    Get data for the homepage (/realestate/).

    :returns: Data for the homepage, such as the date the app was last
        updated and a list of neighborhoods for the dropdown.
    """
    update_date = self.get_last_updated_date()
    log.info(update_date)

    neighborhoods = self.get_neighborhoods()

    data = {
        'update_date': update_date,
        'neighborhoods': neighborhoods}
    log.info(data)

    return data

def get_last_updated_date(self):
    """Get the recorded date of the most recent published sale."""
    query = SESSION.query(Cleaned).filter(
        Cleaned.detail_publish.is_(True)).order_by(
        desc(Cleaned.document_recorded)).limit(1).all()
    log.info(query)

    updated_date = ''
    for row in query:
        updated_date = ymd_to_full_date(
            (row.document_recorded).strftime('%Y-%m-%d'), no_day=True)
    log.info(updated_date)

    SESSION.close()

    return updated_date

def parse_page(self, i, year, month, day):
    """Parse results page for sale document IDs."""
    # Save table page
    log.info('Parse page {0} for {1}-{2}-{3}'.format(i, year, month, day))
    html_out = '{}/data/raw/{}-{}-{}/page-html/page{}.html'.format(
        PROJECT_DIR, year, month, day, i)
    with open(html_out, 'wb') as f_out:
        f_out.write((self.driver.page_source).encode('utf-8'))

    # TODO: Read from memory instead of new output file
    soup = BeautifulSoup(open(html_out), "html.parser")

    # For this one page. List of Object IDs; first table row is empty.
    rows = soup.find_all('td', class_="igede12b9e")

    log.info('{} records to scrape for this page'.format(len(rows) - 1))

    for j in range(1, len(rows)):
        # overall_row = (i - 1) * 20 + j
        self.parse_sale(j, rows, year, month, day)

    url = ('http://onlinerecords.orleanscivilclerk.com/RealEstate/' +
           'SearchResults.aspx')
    log.info('Load URL {}'.format(url))
    self.driver.get(url)

    html_id = 'OptionsBar1_imgNext'
    log.info('Find next page button at HTML ID {}'.format(html_id))
    next_button_elem = self.driver.find_element_by_id(html_id)

    log.info('Click next page button')
    next_button_elem.click()

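# A hedged sketch of the "read from memory" TODO in parse_page() above:
# BeautifulSoup accepts the page-source string directly, so the just-written
# file does not need to be reopened (the write can stay for archiving).
# The helper name is an assumption, not existing code in this repo.
def parse_rows_from_page_source(page_source):
    """Return the sale-document ID cells parsed from a page-source string."""
    soup = BeautifulSoup(page_source, "html.parser")
    # Same selector parse_page() uses against the saved file.
    return soup.find_all('td', class_="igede12b9e")

# Usage inside parse_page():
#     rows = parse_rows_from_page_source(self.driver.page_source)
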
def main(self):
    """Run Join() and Clean() scripts."""
    log.info('Clean')
    print('Cleaning...')

    log.debug('get_rows_from_query')
    rows = Join(initial_date=self.initial_date,
                until_date=self.until_date).get_rows_from_query()

    log.debug('add_location_fields_temp_hack')
    rows = Join(
        initial_date=self.initial_date,
        until_date=self.until_date).add_location_fields_temp_hack(rows)

    log.debug('len(rows): %d', len(rows))

    prepped_rows = self.prep_rows(rows)
    clean_rows = self.clean_rows(prepped_rows)

    self.commit_rows(clean_rows)

def cli(arguments):
    """Parse command-line arguments."""
    # Catch any missed errors
    if cli_has_errors(arguments):
        return

    if arguments['<single_date>']:  # Single date
        early_date = arguments['<single_date>']
        late_date = arguments['<single_date>']
        log.info('Initializing single date: {}.'.format(early_date))
    elif arguments['<early_date>'] and arguments['<late_date>']:  # Date range
        early_date = arguments['<early_date>']
        late_date = arguments['<late_date>']
        log.info('Initializing date range: {0} to {1}.'.format(
            early_date, late_date))
    else:  # No dates provided
        log.info('Initializing all dates that need it.')
        initialize()  # Default: initialize all in need.
        return

    # Check for errors
    early_datetime = datetime.strptime(early_date, "%Y-%m-%d")
    late_datetime = datetime.strptime(late_date, "%Y-%m-%d")

    if early_datetime > late_datetime:
        raise BadDateRangeError("The date range does not make sense.")

    initialize(initial_date=early_date, until_date=late_date)

def parse_sale(self, j, rows, year, month, day):
    """Parse single sale page and save HTML."""
    document_id = rows[j].string

    url = ('http://onlinerecords.orleanscivilclerk.com/RealEstate/' +
           'SearchResults.aspx?global_id={}&type=dtl').format(document_id)

    try:
        log.info('Load sale URL {}'.format(url))
        self.driver.get(url)
    except Exception:  # TODO: Handle specific errors.
        log.exception('Error loading sale URL {}'.format(url))

    html = self.driver.page_source
    html_out = "{0}/data/raw/{1}-{2}-{3}/form-html/{4}.html".format(
        PROJECT_DIR, year, month, day, document_id)

    log.info('Save {}'.format(html_out))
    with open(html_out, "wb") as f_out:
        f_out.write(html.encode('utf-8'))

    try:
        assert not self.is_error_page(html_out)  # TODO: Read from memory
    except AssertionError:
        log.exception('Received error page')
        log.info('Deleting error page {}'.format(html_out))
        os.remove(html_out)

def cli(arguments):
    """Parse command-line arguments."""
    if cli_has_errors(arguments):
        return

    if arguments['<single_date>']:
        early_date = arguments['<single_date>']
        late_date = arguments['<single_date>']
        log.info('Scraping single date: {}'.format(early_date))
    elif arguments['<early_date>'] and arguments['<late_date>']:
        early_date = arguments['<early_date>']
        late_date = arguments['<late_date>']
        log.info('Scraping date range: {0} to {1}'.format(
            early_date, late_date))
    else:  # No dates provided. Default is to scrape previous day.
        log.info('Scraping yesterday')
        Scrape().main()
        return

    # Check for errors
    early_datetime = datetime.strptime(early_date, '%Y-%m-%d')
    late_datetime = datetime.strptime(late_date, '%Y-%m-%d')

    if early_datetime > late_datetime:
        raise BadDateRangeError('Bad date range')

    Scrape(initial_date=early_date, until_date=late_date).main()

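# The cli() functions above repeat the same date parsing and validation. A
# hedged sketch of a shared helper they could call; the name
# parse_date_range() is an assumption, not an existing helper in this repo.
def parse_date_range(arguments):
    """Return validated (early_date, late_date) strings, or None if absent."""
    if arguments['<single_date>']:
        early_date = late_date = arguments['<single_date>']
    elif arguments['<early_date>'] and arguments['<late_date>']:
        early_date = arguments['<early_date>']
        late_date = arguments['<late_date>']
    else:
        return None  # Caller falls back to its default behavior.

    if (datetime.strptime(early_date, '%Y-%m-%d') >
            datetime.strptime(late_date, '%Y-%m-%d')):
        raise BadDateRangeError('The date range does not make sense.')

    return early_date, late_date
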
def enter_date_filed_from(self, search_date):
    """Enter "date from"."""
    html_id = 'x:1221134975.0:mkr:3'
    log.info('Find "date filed from" field at HTML ID {}'.format(html_id))
    date_file_from_elem = self.driver.find_element_by_id(html_id)

    log.info('Click on "date filed from" field')
    date_file_from_elem.click()

    log.info('Enter {} into "date filed from" field'.format(search_date))
    date_file_from_elem.send_keys(search_date)

def enter_date_filed_to(self, search_date):
    """Enter "date to"."""
    html_id = 'x:96043147.0:mkr:3'
    log.info('Find "date filed to" field at HTML ID {}'.format(html_id))
    date_file_to_elem = self.driver.find_element_by_id(html_id)

    log.info('Click on "date filed to" field')
    date_file_to_elem.click()

    log.info('Enter {} into "date filed to" field'.format(search_date))
    date_file_to_elem.send_keys(search_date)

def enter_password(self):
    """Type in password."""
    html_id = 'Header1_txtPassword'
    log.info('Find password field at HTML ID {}'.format(html_id))
    password_elem = self.driver.find_element_by_id(html_id)

    log.info('Enter password from environment variable')
    password_elem.send_keys(os.environ.get('REAL_ESTATE_LRD_PASSWORD'))

    log.info('Press enter to submit credentials and log in')
    # Trigger search function. Don't use RETURN because PhantomJS fails.
    password_elem.send_keys(Keys.ENTER)

def logout(self):
    """Log out of the site."""
    url = ('http://onlinerecords.orleanscivilclerk.com/RealEstate/' +
           'SearchEntry.aspx')
    # No matter which page you're on, you can go back here and log out.
    log.info('Load {}'.format(url))
    self.driver.get(url)

    html_id = 'Header1_lnkLogout'
    log.info('Find logout button at HTML ID {}'.format(html_id))
    logout_elem = self.driver.find_element_by_id(html_id)

    log.info('Click logout button')
    logout_elem.click()

def cycle_through_dates(self):
    """For each date in range, search, parse results and save HTML.

    TODO: Make this asynchronous.
    """
    current_date = self.initial_date

    # Must search each date one at a time because there is a limit of
    # 300 results per search. A single day shouldn't reach that ceiling.
    while current_date != (self.until_date + timedelta(days=1)):
        year = current_date.strftime('%Y')  # "2014"
        month = current_date.strftime('%m')  # "09"
        day = current_date.strftime('%d')  # "09"

        log.info('Search records for {}-{}-{}'.format(year, month, day))

        # Check if folder for this day exists. If not, then make one.
        pagedir = "{0}/data/raw/{1}-{2}-{3}/page-html".format(
            PROJECT_DIR, year, month, day)
        formdir = "{0}/data/raw/{1}-{2}-{3}/form-html".format(
            PROJECT_DIR, year, month, day)

        if not os.path.exists(pagedir):
            log.info('Create directory {}'.format(pagedir))
            os.makedirs(pagedir)
        if not os.path.exists(formdir):
            log.info('Create directory {}'.format(formdir))
            os.makedirs(formdir)

        search_date = '{}{}{}'.format(month, day, year)

        # The meat of this loop
        self.navigate_search_page(year, month, day)
        self.search_parameters(search_date)
        self.parse_results(year, month, day)

        current_date += timedelta(days=1)
