def send_emails(emails_list):
    """Send emails based on a list of tuples in the form (address, subject, body, *attachments).

    Nothing is sent (and None is returned) when 'config.EMAILS_ENABLED' is falsy. If the
    sender login is rejected, the user is prompted to update the saved password and retry;
    if they decline, the error is logged and no further sends are attempted. Any other
    send error is logged and the affected email is passed to 'save_unsent_email()'.

    Args:
        emails_list (list): A list of tuples, with each tuple containing the following data:
            - address (str): the destination email address.
            - subject (str): the subject line of the email.
            - body (str): the main body content of the email.
            - *attachments (str, optional): 0 or more attachment file locations.

    Returns:
        None.
    """
    if not config.EMAILS_ENABLED:
        return None

    # Provide sender email password as second argument if not saving via 'set_email_login()'
    with yagmail.SMTP(config.SENDER_EMAIL) as server:
        print("\nSending {} email(s)...".format(len(emails_list)))
        valid_login = True
        for address, subject, body, *attachments in emails_list:
            # Re-attempt send if login failed and password updated via 'set_email_login()'
            while valid_login:
                try:
                    server.send(to=address, subject=subject, contents=body,
                                attachments=attachments)
                    print("Email sent to {} (subject: {}).".format(address, subject))
                    time.sleep(0.5)
                    # Break out of 'while valid_login' loop to move to next email
                    break

                except smtplib.SMTPAuthenticationError as e:
                    short_err_msg = "Email login failed. Please ensure that less secure app " \
                                    "access is 'On' for the sender email Google account and " \
                                    "that your saved login details are correct."
                    print(short_err_msg)
                    # Provide option of updating password and re-attempting send
                    answer = input("Type 'y' and press Enter to update your saved password "
                                   "and re-attempt the login: ")
                    # NOTE(review): the branch below was rebuilt from a masked span in the
                    # source (the end of the prompt through the start of 'get_err_str(');
                    # confirm the answer check and the message prefix against the original.
                    if answer.strip().lower() == 'y':
                        # Loop round again so the same email is re-attempted
                        set_email_login()
                    else:
                        err_msg = logs.get_err_str(
                            e, short_err_msg + " " + "These can be updated using "
                            "set_email_login().", trace=False)
                        logs.log_error(err_msg)
                        # Skip send attempts for any subsequent emails
                        valid_login = False

                # Prevent all emails failing; log error and save unsent email
                except Exception as e:
                    err_msg = logs.get_err_str(e, "Error sending email to {}.".format(address))
                    logs.log_error(err_msg)
                    save_unsent_email(address, subject, body)
                    # Break out of while loop to move to next email
                    break
def download_robotstxt(self, max_attempts=5, wait=120):
    """Fetch the site's robots.txt file and return its current content (str).

    Timeouts and connection errors are retried; once the final attempt fails, the
    error is recorded in 'self.err_message' and the exception is re-raised.
    Redirects are not followed, so any non-200 response is treated as a failure.

    Args:
        max_attempts (int): the maximum number of robots.txt URL connection attempts.
        wait (int): the number of seconds between connection attempts.

    Returns:
        str: the body of the robots.txt response.

    Raises:
        requests.exceptions.Timeout: the final attempt timed out.
        requests.exceptions.ConnectionError: the final attempt could not connect.
        requests.exceptions.HTTPError: the URL returned a non-200 status code.
    """
    robots_url = self.url + "robots.txt"
    last_attempt = max_attempts - 1

    for attempt in range(max_attempts):
        retry_notice = " Trying again in {} seconds. " \
                       "Attempt {} of {}.".format(wait, attempt + 1, max_attempts)
        request_headers = {'User-Agent': config.USER_AGENT}

        try:
            response = requests.get(robots_url, headers=request_headers,
                                    allow_redirects=False, timeout=40)

        except requests.exceptions.Timeout as exc:
            problem = "{} timed out before sending a valid response.".format(robots_url)
            if attempt >= last_attempt:
                # Final connection attempt failed
                self.err_message = logs.get_err_str(exc, problem, trace=False)
                raise
            print(problem + retry_notice)
            time.sleep(wait)
            continue

        except requests.exceptions.ConnectionError as exc:
            problem = "There was a connection error when accessing {}.".format(robots_url)
            if attempt >= last_attempt:
                # Final connection attempt failed
                self.err_message = logs.get_err_str(exc, problem)
                logs.admin_email_errors.append(self.err_message)
                raise
            print(problem + retry_notice)
            time.sleep(wait)
            continue

        # Only reached when the request completed without raising
        if req_failed := (response.status_code != 200):
            self.err_message = "{} returned a {} status code." \
                               "".format(robots_url, response.status_code)
            raise requests.exceptions.HTTPError

        # URL was successfully reached and returned a 200 status code
        return response.text
def main():
    """Run every site check, reporting any fatal error to the admin.

    A fatal error (raised while reading the sites CSV or running the checks) is logged
    and queued as an admin email. Whatever happens, queued admin emails are sent (or a
    note is printed when emails are disabled) and the end of the run is logged.
    """
    try:
        RunChecks(sites_from_file(config.MONITORED_SITES)).check_all()

    except Exception as exc:
        # Fatal error during CSV read or RunChecks
        logs.log_error(logs.get_err_str(exc, "Fatal error."))

        admin_body = emails.get_admin_email_body(
            "There was a fatal error during the latest robots.txt checks which "
            "caused the program to terminate unexpectedly."
        )
        fatal_email = (config.ADMIN_EMAIL, "Robots.txt Check Fatal Error", admin_body)
        emails.admin_email.append(fatal_email)

    finally:
        if not config.EMAILS_ENABLED:
            print(
                "Note: emails are disabled. Details of the program run have been printed "
                "and/or logged. Set 'EMAILS_ENABLED' to equal 'True' to send/receive emails."
            )
        else:
            emails.send_emails(emails.admin_email)

        divider = "-" * 20
        logs.update_main_log("\n{}END OF RUN{}\n".format(divider, divider), timestamp=False)
def run_check(self):
    """Refresh the stored robots.txt records and look for changes.

    Nothing is attempted if an error was already recorded (e.g. an invalid URL found
    during __init__). Errors raised during the check are captured in
    'self.err_message' rather than propagated.

    Returns:
        The class instance representing the completed robots.txt check.
    """
    # Only proceed when no error/invalid URL was recorded during __init__
    if not self.err_message:
        try:
            self.update_records(self.download_robotstxt())
            if not self.first_run:
                self.check_diff()

        except Exception as exc:
            # Anticipated errors are caught in 'download_robotstxt()' and already
            # logged in 'self.err_message'; only unexpected ones are recorded here
            if not self.err_message:
                unexpected = "Unexpected error during {} check.".format(self.url)
                self.err_message = logs.get_err_str(exc, unexpected)
                logs.admin_email_errors.append(self.err_message)

    return self
def __init__(self, url):
    """Set up the state for a single site's robots.txt check.

    The URL format is validated and the site's data directories are created if
    needed. Problems are recorded in 'self.err_message' instead of being raised.

    Args:
        url (str): the absolute URL of the website homepage, with a trailing slash,
            e.g. 'https://www.example.com/'.
    """
    self.url = url
    # Updated where appropriate as the check progresses
    self.first_run = False
    self.err_message = None
    self.file_change = False

    # Use site domain name as directory name (scheme prefix and trailing slash removed)
    if self.url[:5] == 'https':
        self.dir = config.PATH + "data/" + self.url[8:-1]
    else:
        self.dir = config.PATH + "data/" + self.url[7:-1]
    program_files_dir = self.dir + "/program_files"
    self.old_file = program_files_dir + "/old_file.txt"
    self.new_file = program_files_dir + "/new_file.txt"

    # Content assigned during 'update_records()' after a successful check
    self.old_content = None
    self.new_content = None

    if (self.url[:4] != "http") or (self.url[-1] != "/"):
        self.err_message = "{} is not a valid site URL. The site URL must be absolute and " \
                           "end in a slash, e.g. 'https://www.example.com/'.".format(url)

    # If URL is valid and the required directories don't all exist, create them.
    # Checking the innermost directory (rather than only the site directory) repairs a
    # site whose earlier setup stopped part-way, and 'makedirs' also creates any missing
    # parent directories.
    elif not os.path.isdir(program_files_dir):
        try:
            os.makedirs(program_files_dir, exist_ok=True)

        except Exception as e:
            self.err_message = logs.get_err_str(
                e, "Error creating {} directories.".format(self.url))
            logs.admin_email_errors.append(self.err_message)
def sites_from_file(file):
    """Extract monitored sites data from a CSV and return as a list of lists.

    The first row is treated as a header and skipped. Rows with fewer than three
    columns are logged as errors and left out; any columns after the third are ignored.

    Args:
        file (str): file location of a CSV file with the following attributes:
            - Header row labelling the three columns, as listed below.
            - url (col1): the absolute URL of the website homepage, with a trailing slash.
            - name (col2): the website's name identifier (letters/numbers only).
            - email (col3): the email address of the site admin, who will receive alerts.

    Returns:
        list: one [url, name, email] list per successfully extracted row.
    """
    data = []
    # newline='' leaves line-ending handling to the csv module, as its documentation
    # requires; otherwise line breaks inside quoted fields are altered on read
    with open(file, 'r', newline='') as sites_file:
        csv_reader = csv.reader(sites_file, delimiter=',')
        for row_num, row in enumerate(csv_reader):
            # Skip the header row labels
            if row_num == 0:
                continue
            try:
                data.append([row[0], row[1], row[2]])
            except IndexError as e:
                # Row has fewer than three columns (this includes blank lines)
                err_msg = logs.get_err_str(
                    e, "Couldn't extract row {} from CSV.".format(row_num))
                logs.log_error(err_msg)

    return data
def check_site(self, site_attributes):
    """Run a robots.txt check and report for a single site.

    The matching counter ('self.error', 'self.first_run', 'self.change' or
    'self.no_change') is incremented according to the outcome. Unexpected errors are
    logged and reported to the site's email address rather than raised, so that one
    failing site doesn't stop the remaining checks.

    Args:
        site_attributes (list): a list representing a single site's attributes in the
            form [url, name, email]. Each attribute is detailed below.
            - url (str): the absolute URL of the website homepage, with a trailing slash.
            - name (str): the website's name identifier (letters/numbers only).
            - email (str): the email address of the site admin, who will receive alerts.
    """
    try:
        url, name, email = site_attributes
        url = url.strip().lower()
        email = email.strip()

        check = RobotsCheck(url)
        check.run_check()

        if check.err_message:
            report = ErrorReport(check, name, email)
            self.error += 1
        elif check.first_run:
            report = FirstRunReport(check, name, email)
            self.first_run += 1
        elif check.file_change:
            report = ChangeReport(check, name, email)
            self.change += 1
        else:
            report = NoChangeReport(check, name, email)
            self.no_change += 1

        report.create_reports()

    # Prevent all site checks failing; log error to investigate
    except Exception as e:
        err_msg = logs.get_err_str(
            e, "Unexpected error for site: {}.".format(site_attributes))
        logs.log_error(err_msg)

        email_subject = "Robots.txt Check Error"
        email_content = "There was an unexpected error while checking or reporting on the " \
                        "robots.txt file of a site which is associated with your email. " \
                        "If this is the first check, please ensure all site details " \
                        "were provided in the correct format. The error details are " \
                        "shown below.\n\n{}".format(err_msg)
        email_content = emails.replace_angle_brackets(email_content)
        email_body = emails.get_site_email_body(email_content)

        # The failure may have been caused by malformed 'site_attributes', in which case
        # the email address can't be read either. Don't let that raise out of this
        # handler: the error is already logged, so just skip the site email.
        try:
            recipient = site_attributes[2].strip()
        except (IndexError, KeyError, TypeError, AttributeError):
            recipient = None

        if recipient is not None:
            emails.site_emails.append((recipient, email_subject, email_body))

        self.error += 1