def get_email_helper( self ):

    '''
    Returns the email helper stored in self.email_helper, creating one on
        first use.

    If self.email_helper is empty, a new EmailHelper is created, stored in
        self.email_helper, and given the from and to addresses found in
        self.status_email_from and self.status_email_to.
    '''

    # already have a helper?  If so, just hand it back.
    existing_helper = self.email_helper
    if ( existing_helper ):

        return existing_helper

    #-- END check to see if helper already stored in instance --#

    # nothing stored yet.  Create and store.
    self.email_helper = EmailHelper()

    # set the from and to addresses on the stored helper.
    self.email_helper.set_from_address( self.status_email_from )
    self.email_helper.set_to_address( self.status_email_to )

    # return whatever is now stored in the instance.
    return self.email_helper

#-- END function get_email_helper() --#
class Collector( object ):

    '''
    This class is a helper for crawling and processing items.

    Items are queued with add_item_to_queue(), processed one at a time by
        process_item_queue() (which calls process_item() for each), and any
        errors are recorded through add_error().  Child classes are expected
        to override gather_items() and process_item(); the versions here do
        nothing.
    '''

    #============================================================================
    # Constants-ish
    #============================================================================

    FORMAT_DATE_STRING = "%Y-%m-%d"
    HEADER_VARIABLE_NAME_USER_AGENT = "User-Agent"

    # constants to hold keys for error details
    ERROR_ID = "id"
    ERROR_ITEM = "item"
    ERROR_MESSAGE = "message"
    ERROR_EXCEPTION = "exception"

    # error output
    ERROR_OUTPUT_FILE = "file"
    ERROR_OUTPUT_DB = "database"
    ERROR_OUTPUT_NONE = "none"

    # defaults
    ERROR_UID_SYSTEM_ERROR = "SYSTEM_ERROR"

    #============================================================================
    # Instance variables (class-level defaults)
    #============================================================================

    # DEBUG and status variables
    debug = False
    debug_string = ""
    status_string = ""
    total_articles_processed = 0
    current_id = None
    current_item = None
    current_date = datetime.datetime.now() # default to now (evaluated once, when class is defined).
    batch_identifier = datetime.datetime.now().strftime( "%Y.%m.%d-%H.%M.%S" )

    # queue containers.  These class-level objects are only fallbacks for
    #    child classes that do not call __init__() - __init__() replaces
    #    them with per-instance containers.
    item_list = []
    id_to_item_map = {}

    # compiled regex used by get_id_for_item() to pull an ID out of an item.
    regex_docid = None

    # variable to hold BeautifulSoupHelper, if needed.
    bs_helper = None

    # error handling variables
    error_id_to_details_map = {}
    error_limit = -1
    error_output_type = "none"
    system_error_count = 0

    # status email variables
    email_helper = None
    status_email_from = ""
    status_email_to = ""
    status_email_server = "localhost"

    #============================================================================
    # Instance methods
    #============================================================================

    def __init__( self ):

        '''
        Gives each instance its own queue and error containers, so state is
            not shared between instances through the class-level defaults.
        '''

        super( Collector, self ).__init__()

        self.item_list = []
        self.id_to_item_map = {}
        self.error_id_to_details_map = {}

    #-- END method __init__() --#


    def _build_error_details( self, item_id_IN, item_IN, error_message_IN ):

        '''
        Creates an Import_Error, populates the fields shared by item errors
            and system errors, and returns it.
        '''

        # return reference
        details_OUT = None

        details_OUT = Import_Error()
        details_OUT.unique_identifier = item_id_IN
        details_OUT.archive_source = "Newsbank"
        details_OUT.item = item_IN
        details_OUT.message = error_message_IN
        details_OUT.item_date = self.current_date
        details_OUT.batch_identifier = self.batch_identifier

        return details_OUT

    #-- END method _build_error_details() --#


    def add_error( self, item_IN = "", id_IN = "", error_message_IN = "", exception_IN = None, stack_trace_IN = "" ):

        '''
        Records an error in self.error_id_to_details_map, keyed by its ID.

        - If an item is passed in, the error is tied to that item.  The ID is
            id_IN if present, else whatever get_id_for_item() returns.  If
            no usable ID results, nothing is recorded.
        - If no item is passed in but there is an error message, a system
            error is recorded.  The ID is id_IN if present, else it is
            generated from ERROR_UID_SYSTEM_ERROR, the batch identifier and
            the running system error count.
        - If neither is passed in, nothing is recorded.

        When self.error_output_type is ERROR_OUTPUT_DB the error is also
            save()d right away.  Always calls check_error_limit() and
            outputs a debug message.

        Parameters:
        - item_IN - item the error relates to, if any.
        - id_IN - optional ID to store the error under.
        - error_message_IN - description of the error.
        - exception_IN - optional exception.  If present, its string value
            and traceback.format_exc() are stored (item errors only).
        - stack_trace_IN - optional stack trace text, used only when there
            is no exception (item errors only).
        '''

        # declare variables
        me = "add_error"
        item_id = ""
        error_details = None
        my_output_type = ""
        output_error = False
        debug_message = ""

        # get output type up front, so it applies to item errors and system
        #    errors alike.
        my_output_type = self.error_output_type

        # got an item?
        if ( item_IN ):

            # Do we have an ID passed in?
            if ( id_IN ):

                # we have an ID passed in.  Use it.
                item_id = id_IN

            else:

                # no ID passed in.  Try to parse it out from item.
                item_id = self.get_id_for_item( item_IN )

            #-- END check to see if ID passed in. --#

            # if we have an ID
            if ( item_id ):

                # make and populate model instance
                error_details = self._build_error_details( item_id, item_IN, error_message_IN )

                # Got an exception?
                if ( exception_IN ):

                    # output exception and stack trace.
                    error_details.exception = str( exception_IN )
                    error_details.stack_trace = traceback.format_exc()

                elif ( stack_trace_IN ):

                    # no exception, but stack trace passed in.
                    error_details.stack_trace = stack_trace_IN

                #-- END check to see if exception --#

                output_error = True

            #-- END check to see if there is an Item ID --#

        # no item.  Got an error message?
        elif ( error_message_IN ):

            # got an error, but not about an item.  Increment system error
            #    count.
            self.system_error_count += 1

            # ID passed in?
            if ( id_IN ):

                # Yes, use it.
                item_id = id_IN

            else:

                # no, generate one.
                item_id = self.ERROR_UID_SYSTEM_ERROR + "-" + self.batch_identifier + "-" + str( self.system_error_count )

            #-- END check to see if ID passed in. --#

            # populate an error instance - no item for a system error.
            error_details = self._build_error_details( item_id, None, error_message_IN )

            output_error = True

        #-- END check to see if we have an item passed in --#

        # See if we are to output error
        if ( output_error == True ):

            # if type is database, save now so the error is not lost.
            if ( my_output_type == self.ERROR_OUTPUT_DB ):

                error_details.save()

            #-- Check if type is database --#

            # Add to the map.
            self.error_id_to_details_map[ error_details.unique_identifier ] = error_details

            # set debug message.  str() everything - ID, item and message
            #    are not guaranteed to be strings.
            debug_message = "error added. ID: " + str( item_id ) + "; Item: " + str( item_IN ) + "; message: " + str( error_message_IN ) + "; exception: " + str( exception_IN )

        else:

            # Nothing to output.  Error.
            debug_message = "error adding error - add_error invoked, but nothing passed in, so no error added."

        #-- END check to see if we output error. --#

        # always check error limit and output debug message
        self.check_error_limit()
        self.output_debug( debug_message, "Collector." + me )

    #-- END method add_error() --#


    def add_item_to_queue( self, item_IN, id_IN = "" ):

        '''
        Adds an item to the queue.  If item_IN is empty, does nothing.

        The item's ID is id_IN if present, else whatever get_id_for_item()
            returns.  If there is a usable ID, the item is stored in
            self.id_to_item_map under that ID.  The item is always appended
            to self.item_list.
        '''

        # declare variables
        item_id = ""

        # got an item?
        if ( item_IN ):

            # Do we have an ID passed in?
            if ( id_IN ):

                # we have an ID passed in.  Use it.
                item_id = id_IN

            else:

                # no ID passed in.  Try to parse it out from item.
                item_id = self.get_id_for_item( item_IN )

            #-- END check to see if ID passed in. --#

            # if we have an ID, add item to map.
            if ( item_id ):

                self.id_to_item_map[ item_id ] = item_IN

            #-- end check to see if we found an ID --#

            # Also add this item to the list.
            self.item_list.append( item_IN )

        #-- END check to see if we have an item passed in --#

    #-- END method add_item_to_queue() --#


    def check_error_limit( self ):

        '''
        If self.error_limit is 0 or greater and the current error count is at
            or over that limit, sends a status email saying so.  A negative
            limit means no limit.
        '''

        # declare variables
        my_error_count = -1
        my_error_limit = -1
        status_message = ""

        # first, see if we have a limit.
        my_error_limit = self.error_limit
        if ( my_error_limit >= 0 ):

            # got a limit.  Get error count.
            my_error_count = self.get_error_count()

            # is count greater than or equal to limit?
            if ( my_error_count >= my_error_limit ):

                # yup.  We are at or over limit.  Send status message.
                status_message = "Error limit exceded: errors = " + str( my_error_count ) + "; limit = " + str( my_error_limit )
                self.send_status_email( status_message, "Collector status update - error limit exceeded" )

            #-- END check to see if error count GTE limit --#

        #-- END check to see if error limit. --#

    #-- END method check_error_limit() --#


    def collect( self, *args, **kwargs ):

        '''
        Main method that drives collecting.  Calls initialize(), then
            gather_items() (passing along all arguments), then, if anything
            is in self.id_to_item_map, process_item_queue().

        A status email is sent on success and on failure.  A
            CollectorException is reported and swallowed; any other
            Exception is reported and re-raised.  output_errors() is called
            on every path, including when an exception is re-raised.

        Returns the string "Success!" whenever it returns at all.
        '''

        # return reference
        status_OUT = "Success!"

        # declare variables
        current_date_time_string = ""
        status_message = ""

        # initialize
        self.initialize()
        current_date_time_string = datetime.datetime.now().strftime( "%Y.%m.%d-%H.%M.%S" )

        # Use try-except block to try to always die gracefully, output errors.
        try:

            # first, gather the items.
            self.gather_items( *args, **kwargs )

            # Then, see if there are any items in queue to process.
            if ( len( self.id_to_item_map ) > 0 ):

                # there is something in the map.  Process the queue.
                self.process_item_queue()

            #-- END check to see if we have items to process. --#

        except CollectorException as ce:

            # CollectorException - output it.
            status_message = "\nERROR: CollectorException caught, message: " + str( ce ) + "\n" + traceback.format_exc()
            print( status_message )
            self.send_status_email( "Collection error at " + current_date_time_string + ".\n\nbatch ID: " + self.batch_identifier + "\n\nmessage:\n" + status_message, "Collector update - collection error - CollectorException caught at " + current_date_time_string + "." )

        except Exception as e:

            # unknown exception.  Report it, then let it propagate.
            status_message = "\nERROR: Exception caught, message: " + str( e ) + "\n" + traceback.format_exc()
            print( status_message )
            self.send_status_email( "Collection error at " + current_date_time_string + ".\n\nbatch ID: " + self.batch_identifier + "\n\nmessage:\n" + status_message, "Collector update - collection error - Exception caught at " + current_date_time_string + "." )
            raise

        else:

            # No exceptions.
            self.send_status_email( "Collection completed successfully at " + current_date_time_string + ".\n\nbatch ID: " + self.batch_identifier, "Collector update - collection completed successfully at " + current_date_time_string + "." )

        finally:

            # Output errors - in finally so recorded errors are still
            #    written out when an exception is being re-raised.
            self.output_errors()

        #-- END try-except block --#

        return status_OUT

    #--- END method collect() ---#


    def do_include_item( self, item_IN ):

        '''
        Accepts an item, returns whether it should be included.  This
            implementation ignores the item and always returns True.
        '''

        # return reference
        include_item_OUT = True

        return include_item_OUT

    #-- END function do_include_item() --#


    def gather_items( self, *args, **kwargs ):

        '''
        Called by collect() to populate the item queue.  This implementation
            does nothing and returns an empty string.
        '''

        # return reference
        status_OUT = ""

        return status_OUT

    #-- END method gather_items() --#


    def get_bs_helper( self ):

        '''
        Returns the helper stored in self.bs_helper.  If nothing is stored,
            creates a BeautifulSoupHelper, stores it, and returns it.
        '''

        # return reference
        instance_OUT = None

        # get instance.
        instance_OUT = self.bs_helper

        # got one?
        if ( not( instance_OUT ) ):

            # no.  Create and store.
            self.bs_helper = BeautifulSoupHelper()

            # try again.  If nothing this time, nothing we can do.
            instance_OUT = self.bs_helper

        #-- END check to see if object is stored in instance --#

        return instance_OUT

    #-- END method get_bs_helper() --#


    def get_email_helper( self ):

        '''
        Returns the helper stored in self.email_helper.  If nothing is
            stored, creates an EmailHelper, sets its from and to addresses
            from self.status_email_from and self.status_email_to, stores it,
            and returns it.
        '''

        # return reference
        instance_OUT = None

        # get instance.
        instance_OUT = self.email_helper

        # got one?
        if ( not( instance_OUT ) ):

            # no.  Create and store.
            self.email_helper = EmailHelper()

            # set the from and to addresses.
            self.email_helper.set_from_address( self.status_email_from )
            self.email_helper.set_to_address( self.status_email_to )

            # try again.  If nothing this time, nothing we can do.
            instance_OUT = self.email_helper

        #-- END check to see if object is stored in instance --#

        return instance_OUT

    #-- END method get_email_helper() --#


    def get_error_count( self ):

        '''
        Returns the number of errors in self.error_id_to_details_map - 0 if
            the map is empty.  If there is no map at all (None), returns -1.
        '''

        # return reference
        value_OUT = -1

        # declare variables
        error_map = None

        # get map.
        error_map = self.error_id_to_details_map

        # got one?  Check for None explicitly - an empty map is a valid map
        #    that holds 0 errors.
        if ( error_map is not None ):

            value_OUT = len( error_map )

        #-- END check to see if error map is stored in instance --#

        return value_OUT

    #-- END method get_error_count() --#


    def get_id_for_item( self, item_IN ):

        '''
        Accepts an item, uses the regular expression from get_regex_id() to
            try to retrieve the item's ID.  Returns the first match found by
            findall().  Returns -1 if the item is empty, there is no regular
            expression, or the regular expression does not match.
        '''

        # return reference
        id_OUT = -1

        # declare variables
        my_id_regex = None
        id_results = None

        # got an item?
        if ( item_IN ):

            # yes.  Try to parse out the id.
            my_id_regex = self.get_regex_id()
            if ( my_id_regex ):

                # see if we have an ID.
                id_results = my_id_regex.findall( item_IN )

                self.output_debug( "- *** item id?: " + str( id_results ) + "\n" )

                # ID should be the first result - if there is one.
                if ( id_results ):

                    id_OUT = id_results[ 0 ]

                #-- END check to see if regex matched anything. --#

            #-- END check to see if we have a regex. --#

        #-- END check to see if item passed in. --#

        return id_OUT

    #-- END method get_id_for_item() --#


    def get_regex_id( self ):

        '''
        Returns whatever is stored in self.regex_docid.
        '''

        # return reference
        regex_OUT = None

        # get compiled regex.
        regex_OUT = self.regex_docid

        return regex_OUT

    #-- END method get_regex_id() --#


    def initialize( self ):

        '''
        Sets self.batch_identifier to the current date and time, formatted
            as "%Y.%m.%d-%H.%M.%S".
        '''

        # declare variables
        current_date_time = None
        now_string = ""

        # build identifier from current date and time.
        current_date_time = datetime.datetime.now()
        now_string = current_date_time.strftime( "%Y.%m.%d-%H.%M.%S" )

        # set batch identifier to now_string
        self.batch_identifier = now_string

    #-- END method initialize() --#


    def output_debug( self, message_IN, me_IN = "", prefix_IN = "" ):

        '''
        Accepts message string.  If self.debug is True and the message is
            not empty, passes it to print().  Otherwise does nothing.

        Parameters:
        - message_IN - message to output.
        - me_IN - optional routine name.  If present, message is output as
            "In <me_IN>: <message>".
        - prefix_IN - optional prefix placed in front of everything else.
        '''

        # declare variables
        debug_message = ""

        # got a message, and is debug on?
        if ( ( message_IN ) and ( self.debug == True ) ):

            # For now, just print.
            debug_message = message_IN

            # got a me value?
            if ( me_IN ):

                debug_message = "In " + me_IN + ": " + debug_message

            #-- END check to see if routine name passed in. --#

            # Got a prefix?
            if ( prefix_IN ):

                debug_message = prefix_IN + debug_message

            #-- END check to see if prefix. --#

            print( debug_message )

        #-- END check to see if message and debug is on --#

    #-- END method output_debug() --#


    def output_errors( self ):

        '''
        Processes errors based on self.error_output_type.

        - ERROR_OUTPUT_FILE - writes the error count and one line per error
            to a new file named "errors-<date-time>.log".
        - ERROR_OUTPUT_DB - only outputs a debug message; add_error() saves
            each error as it comes in.
        - empty or ERROR_OUTPUT_NONE - does nothing.
        '''

        # declare variables
        my_output_type = ""
        error_map = None
        error_count = -1
        now_string = ""
        error_file_name = ""
        error_file = None
        current_id = -1
        current_details = None

        # do we output errors?
        my_output_type = self.error_output_type
        if ( ( my_output_type ) and ( my_output_type != self.ERROR_OUTPUT_NONE ) ):

            # count errors
            error_map = self.error_id_to_details_map
            error_count = len( error_map )

            self.output_debug( "\n\nError Count: " + str( error_count ) )

            # got any errors?
            if ( error_count > 0 ):

                # is output type file?
                if ( my_output_type == self.ERROR_OUTPUT_FILE ):

                    # make error log file name.
                    now_string = datetime.datetime.now().strftime( "%Y.%m.%d-%H.%M.%S" )
                    error_file_name = "errors-" + now_string + ".log"
                    self.output_debug( "\n\nError details written to file " + error_file_name )

                    # open file for writing
                    with open( error_file_name, "w" ) as error_file:

                        # write error count
                        error_file.write( error_file_name + "\n\nError Count: " + str( error_count ) + "\n\n" )

                        # loop over errors.
                        for current_id, current_details in error_map.items():

                            # write error.  str() every part - system errors
                            #    have an item of None.
                            error_file.write( "\n- id = " + str( current_details.unique_identifier ) + "; item = " + str( current_details.item ) + "; message = " + str( current_details.message ) + "; exception = " + str( current_details.exception ) )

                        #-- END loop over errors. --#

                    #-- END with( error_file ) - close()s file at end of with. --#

                elif ( my_output_type == self.ERROR_OUTPUT_DB ):

                    # database errors were output as they came in, so we
                    #    don't lose them.  Nothing more to write here.
                    self.output_debug( "\n\nError details written to sourcenet_import_error database table" )

                #-- END check of error output type --#

            else:

                self.output_debug( "\n\nNo errors detected, so no errors to output!" )

            #-- END check to see if there are any errors --#

        #-- END check to see if we output errors at all. --#

    #-- END method output_errors --#


    def process_item( self, item_id_IN, item_IN ):

        '''
        Called by process_item_queue() for each item in the queue.  Should
            be overridden in the child class.  This implementation does
            nothing and returns an empty string.
        '''

        # return reference
        status_OUT = ""

        return status_OUT

    #-- END method process_item() --#


    def process_item_queue( self, clear_on_finish_IN = True ):

        '''
        Loops over the items in self.id_to_item_map, in no particular order.
            For each, stores the ID and item in self.current_id and
            self.current_item, then calls process_item().  If process_item()
            raises, the problem is recorded with add_error() and processing
            continues with the next item.  KeyboardInterrupt and SystemExit
            are not recorded - they are re-raised.

        Parameters:
        - clear_on_finish_IN - if True (the default), self.item_list and
            self.id_to_item_map are emptied once the queue is processed.
        '''

        # declare variables
        me = "process_item_queue"
        my_item_queue_map = None
        my_queue_size = -1
        current_id = ""
        current_item = ""
        item_counter = 0

        # anything in queue?
        my_item_queue_map = self.id_to_item_map
        my_queue_size = len( my_item_queue_map )
        if ( my_queue_size > 0 ):

            # loop over a snapshot, so process_item() can add items to the
            #    queue without breaking the loop.
            for current_id, current_item in list( my_item_queue_map.items() ):

                # increment counter and output debug.  str() the ID and
                #    item - ID can be the -1 returned by get_id_for_item().
                item_counter += 1
                self.output_debug( "\n\n- item " + str( item_counter ) + " of " + str( my_queue_size ) + " ( total errors: " + str( self.get_error_count() ) + " ): " + str( current_item ) + " (" + str( current_id ) + ")" )

                # store current ID and item
                self.current_id = current_id
                self.current_item = current_item

                # try/except block, so we can continue processing if error.
                try:

                    # got an item.  Invoke the process_item method.
                    self.process_item( current_id, current_item )

                except Exception as e:

                    # exception.  Log it.
                    self.add_error( current_item, current_id, "ERROR: Exception or child caught in Collector." + me + ".", e )

                except ( KeyboardInterrupt, SystemExit ):

                    # request to stop - do not swallow it.
                    raise

                except:

                    # exception not descended from Exception caught.  Log a message.
                    self.add_error( current_item, current_id, "ERROR: Exception not descended from Exception caught in Collector." + me + ".", None, traceback.format_exc() )

                #-- END try/except block. --#

            #-- END loop over items. --#

            # Once queue is processed, do we clear it out?
            if ( clear_on_finish_IN == True ):

                # yes.  Clear out list and map.
                self.item_list = []
                self.id_to_item_map = {}

            #-- END check to see if we clear out the queue after processing it --#

        #-- END check to make sure we have a queue to process. --#

    #-- END method process_item_queue() --#


    def send_status_email( self, message_IN, subject_IN = "Collector status update" ):

        '''
        If self.status_email_from, self.status_email_to and
            self.status_email_server are all set, and a message is passed
            in, configures the helper from get_email_helper() with server,
            from, to, subject and message, then asks it to send the email.
            If any of the three settings is empty, does nothing.

        Always returns the string "Success!".
        '''

        # return reference
        status_OUT = "Success!"

        # declare variables
        my_email_helper = None

        # first, see if we have from, to, and server.
        if ( ( self.status_email_from ) and ( self.status_email_to ) and ( self.status_email_server ) ):

            # got that.  Got a message?
            if ( message_IN ):

                # got what we need.  Get email helper.
                my_email_helper = self.get_email_helper()

                # Send message.
                my_email_helper.set_smtp_server_host( self.status_email_server )
                my_email_helper.set_from_address( self.status_email_from )
                my_email_helper.set_to_address( self.status_email_to )
                my_email_helper.set_subject( subject_IN )
                my_email_helper.set_message( message_IN )
                my_email_helper.send_email()

            else:

                self.output_debug( "ERROR: No message passed in to Collector.send_status_email(). Not sure what to do." )

            #-- END check to see if message passed in. --#

        #-- END check to make sure we have from and to addresses. --#

        return status_OUT

    #-- END method send_status_email() --#

#-- END class Collector --#
def email_initialize( self, smtp_host_IN = "localhost", smtp_port_IN = -1, smtp_use_ssl_IN = False, smtp_username_IN = "", smtp_password_IN = "", *args, **kwargs ):

    '''
    Builds a new EmailHelper from the SMTP settings passed in and stores it
        in self.email_helper, replacing whatever was stored there.

    Parameters:
    - smtp_host_IN - always passed to the helper as SMTP server host.
    - smtp_port_IN - passed to the helper only if it is greater than 0.
    - smtp_use_ssl_IN - always passed to the helper as the use-SSL setting.
    - smtp_username_IN - passed to the helper only if not empty.
    - smtp_password_IN - passed to the helper only if not empty.
    - *args, **kwargs - accepted, but not used.
    '''

    # create email helper, give it the host.
    new_helper = EmailHelper()
    new_helper.set_smtp_server_host( smtp_host_IN )

    # port - only if a positive value was passed in.
    if ( smtp_port_IN and ( smtp_port_IN > 0 ) ):

        new_helper.set_smtp_server_port( smtp_port_IN )

    #-- END check to see if port passed in. --#

    # use ssl?
    new_helper.set_smtp_server_use_SSL( smtp_use_ssl_IN )

    # username - only if there is one.
    if ( smtp_username_IN ):

        new_helper.set_smtp_server_username( smtp_username_IN )

    #-- END check to see if username passed in --#

    # password - only if there is one.
    if ( smtp_password_IN ):

        new_helper.set_smtp_server_password( smtp_password_IN )

    #-- END check to see if password passed in --#

    # store in instance variable.
    self.email_helper = new_helper

#-- END function email_initialize() --#