# Got something? if ( ( author_organization ) and ( author_organization != "" ) ): # save it. print( " - org. string: " + author_organization ) current_article_author.organization_string = author_organization current_article_author.save() #-- END check to see if there is an organization. --# #-- END check to see if we have an author string --# #-- END loop over authors with no organization_string --# # set stop time. my_summary_helper.set_stop_time() # add info. to summary outputter. my_summary_helper.set_prop_value( "org_string_counter", org_string_counter ) my_summary_helper.set_prop_desc( "org_string_counter", "Found org. string" ) my_summary_helper.set_prop_value( "article_source_counter", article_source_counter ) my_summary_helper.set_prop_desc( "article_source_counter", "Using Article Source" ) my_summary_helper.set_prop_value( "bad_author_string_counter", bad_author_string_counter ) my_summary_helper.set_prop_desc( "bad_author_string_counter", "Bad Author String" ) # generate summary string. summary_string += my_summary_helper.create_summary_string( item_prefix_IN = "==> " ) print( summary_string )
DjangoMemoryHelper.free_memory() # reset mem_counter mem_counter = 0 #-- END check to see if we manage memory now --# #-- END loop over articles --# # End of loop book-keeping. # set stop time. my_summary_helper.set_stop_time() # add info. to summary outputter. my_summary_helper.set_prop_value( "article_count", article_counter ) my_summary_helper.set_prop_desc( "article_count", "Article count" ) my_summary_helper.set_prop_value( "done_counter", done_counter ) my_summary_helper.set_prop_desc( "done_counter", "Article DONE count" ) my_summary_helper.set_prop_value( "no_text_list", ", ".join( map( str, no_text_list ) ) ) my_summary_helper.set_prop_desc( "no_text_list", "Articles with no text" ) my_summary_helper.set_prop_value( "multiple_text_list", ", ".join( map( str, multiple_text_list ) ) ) my_summary_helper.set_prop_desc( "multiple_text_list", "Articles with multiple text" ) my_summary_helper.set_prop_value( "no_raw_data_list", ", ".join( map( str, no_raw_data_list ) ) ) my_summary_helper.set_prop_desc( "no_raw_data_list", "Articles with no raw data" ) my_summary_helper.set_prop_value( "multiple_raw_data_list", ", ".join( map( str, multiple_raw_data_list ) ) ) my_summary_helper.set_prop_desc( "multiple_raw_data_list", "Articles with multiple raw data" )
# Bulk creation is on by default in the collector; uncomment to turn it off.
#reddit_collector.do_bulk_create = False

# Summary helper that gathers the values reported for this run.
my_summary_helper = SummaryHelper()

# Build the QuerySet of posts whose comments we want to collect.
#post_qs = reddit_collect.models.Post.objects.filter( reddit_id = reddit_post_id )
post_qs = reddit_collect.models.Post.objects.filter( reddit_id__in = [ '1cp0i3', '1d67nv' ] )

# Report the stored comment count of the first post in the QuerySet.
django_post = post_qs[ 0 ]
print( "==> num_comments: " + str( django_post.num_comments ) ) # 115, at time of collection
my_summary_helper.set_prop_value( "num_comments", django_post.num_comments )
my_summary_helper.set_prop_desc( "num_comments", "num_comments (post)" )

# Hand the whole QuerySet to the collector's collect_comments() method.
reddit_collector.collect_comments( post_qs )

#================================================================================
# Now, compare collect_comments() output to just grabbing comments with reddiwrap
#================================================================================

# Reload the post from the database after collection.
# NOTE(review): the assignment below is commented out, so reddit_post_id must
#    already be defined earlier in the script - confirm.
# reddit_post_id = '1cp0i3'
django_post = reddit_collect.models.Post.objects.get( reddit_id = reddit_post_id )

# Make the reddiwrap version of the post, so we can pull comments from reddit.
comment_rw_post = django_post.create_reddiwrap_post()
#-- END check to see if related post passed in. --# #-- END loop over comments. --# # update aggregate counter aggregate_counter += comment_counter # memory management gc.collect() django.db.reset_queries() # do query again to see if more to process. comment_qs = reddit_collect.models.Comment.objects.filter( subreddit = None ) # get count of comments. comment_count = comment_qs.count() print( " - Bottom of loop - After count() - processed " + str( aggregate_counter ) + " of " + str( total_comment_count ) + " - " + str( comment_count ) + " to go - " + str( datetime.datetime.now() ) ) #-- END loop over all comments. --# # set stop time. my_summary_helper.set_stop_time() # set aggregate comment counter my_summary_helper.set_prop_value( "aggregate_comment_count", aggregate_counter ) my_summary_helper.set_prop_desc( "aggregate_comment_count", "Comments Processed" ) # generate summary string. summary_string += my_summary_helper.create_summary_string( item_prefix_IN = "==> " ) print( summary_string )
def code_article_data( self, query_set_IN ):

    """
    Accepts query set of Articles.  Gets an ArticleCoder instance from
        self.get_coder_instance(), initializes it from the parameters held
        in this instance's param container, then passes each Article in
        the query set to the coder's code_article() method.  The status of
        each article (or the exception message, if coding raised one) is
        stored via self.record_article_status().

    What the coder creates for each article (Article_Data, duplicate
        detection, etc.) is decided inside the coder class, not here.

    Preconditions: assumes that we have a query set of Articles passed in.
        If it is empty or None, no articles are coded and the counters in
        the summary keep their initial value of -1.

    Postconditions: a status is recorded for each article processed.  If
        rate-limiting is on ( self.do_manage_time ) and
        self.may_i_continue() returns something other than True, processing
        stops and the remaining articles are not coded.

    Parameters:
    - query_set_IN - django QuerySet (or other iterable) of Article
        instances to code.  Each must have "id" and "headline".

    Returns:
    - String - Status message: the coder's debug output if
        ArticleCoder.DEBUG_FLAG is True, followed by the summary string
        (article, error and exception counts).
    """

    # return reference
    status_OUT = ''

    # declare variables
    me = "code_article_data"
    logging_message = ""
    my_logger = None
    do_i_print_updates = False
    my_summary_helper = None
    summary_string = ""
    article_coder = None
    my_params = None
    param_dict = {}
    current_status = ""
    current_headline = ""
    my_exception_helper = None
    exception_message = ""

    # rate-limiting variables
    am_i_rate_limited = False
    continue_work = True

    # auditing variables ( -1 = "no query set, nothing attempted" )
    article_counter = -1
    exception_counter = -1
    error_counter = -1

    # grab a logger.
    my_logger = self.get_logger()

    # do I print some status?
    do_i_print_updates = self.do_print_updates

    # initialize summary helper
    my_summary_helper = SummaryHelper()

    # init rate-limiting
    am_i_rate_limited = self.do_manage_time

    # do we have a query set?
    if ( query_set_IN ):

        # create instance of ArticleCoder.
        article_coder = self.get_coder_instance()

        # initialize ArticleCoder instance from params.

        # Get parent parameter container.
        my_params = self.get_param_container()

        # retrieve the inner dictionary.
        param_dict = my_params.get_parameters()

        # use the dictionary from the param container to initialize.
        article_coder.initialize_from_params( param_dict )

        # loop on the article list, passing each to the ArticleCoder for
        #    processing.
        article_counter = 0
        exception_counter = 0
        error_counter = 0
        continue_work = True
        for current_article in query_set_IN:

            # OK to continue work?  If not, stop here - nothing further is
            #    done for the remaining articles, so there is no reason to
            #    keep pulling them from the query set.
            if ( continue_work != True ):

                break

            #-- END check to see if OK to continue. --#

            # increment article counter
            article_counter += 1

            # rate-limited?
            if ( am_i_rate_limited == True ):

                # yes - start timer.
                self.start_request()

            #-- END pre-request check for rate-limiting --#

            # headline can be empty - this message is built outside the
            #    per-article try, so a None headline must not be allowed to
            #    raise and abort the whole run.
            current_headline = current_article.headline
            if ( current_headline is None ):

                current_headline = ""

            #-- END check to see if headline is None --#

            # a little debugging to start
            logging_message = "\n\n============================================================\n==> article " + str( article_counter ) + ": " + str( current_article.id ) + " - " + current_headline
            my_logger.info( logging_message )

            # print?
            if ( do_i_print_updates == True ):

                print( logging_message )

            #-- END check to see if we print a message.

            # add per-article exception handling, so we can get an idea of how
            #    many articles cause problems.
            try:

                # code the article.
                current_status = article_coder.code_article( current_article )

                # record status
                self.record_article_status( current_article.id, current_status )

                # success?
                if ( current_status != ArticleCoder.STATUS_SUCCESS ):

                    # nope.  Error.
                    error_counter += 1

                    logging_message = "======> In " + me + "(): ERROR - " + current_status + "; article = " + str( current_article )
                    my_logger.debug( logging_message )

                    # print?
                    if ( do_i_print_updates == True ):

                        print( logging_message )

                    #-- END check to see if we print a message.

                #-- END check to see if success --#

            except Exception as e:

                # deliberately broad: one bad article is counted and logged,
                #    then we move on to the next.

                # increment exception_counter
                exception_counter += 1

                # get exception helper.
                my_exception_helper = self.get_exception_helper()

                # log exception, no email or anything.
                exception_message = "Exception caught for article " + str( current_article.id )
                my_exception_helper.process_exception( e, exception_message )

                logging_message = "======> " + exception_message + " - " + str( e )
                my_logger.debug( logging_message )

                # print?
                if ( do_i_print_updates == True ):

                    print( logging_message )

                #-- END check to see if we print a message.

                # record status
                self.record_article_status( current_article.id, logging_message )

            #-- END exception handling around individual article processing. --#

            # rate-limited?
            if ( am_i_rate_limited == True ):

                # yes - check if we may continue.
                continue_work = self.may_i_continue()

            #-- END post-request check for rate-limiting --#

        #-- END loop over articles --#

        # add some debug?
        if ( ArticleCoder.DEBUG_FLAG == True ):

            # yup.
            status_OUT += "\n\n" + article_coder.debug + "\n\n"

        #-- END check to see if we have debug to output. --#

    #-- END check to make sure we have a query set. --#

    # add stuff to summary and print the results.

    # set stop time
    my_summary_helper.set_stop_time()

    # add stuff to summary
    my_summary_helper.set_prop_value( "article_counter", article_counter )
    my_summary_helper.set_prop_desc( "article_counter", "Articles processed" )

    my_summary_helper.set_prop_value( "error_counter", error_counter )
    my_summary_helper.set_prop_desc( "error_counter", "Error count" )

    my_summary_helper.set_prop_value( "exception_counter", exception_counter )
    my_summary_helper.set_prop_desc( "exception_counter", "Exception count" )

    # output - set prefix if you want.
    summary_string += my_summary_helper.create_summary_string( item_prefix_IN = "==> " )
    my_logger.info( summary_string )

    # output summary string as status.
    status_OUT += summary_string

    return status_OUT

#-- END method code_article_data() --#
match_counter += 1 # print the match. print(" - archive ID Match ( edition: " + match_article.edition + " ): " + str(match_article)) # -- END loop over matches --# # -- END check to see if one or more id matches. --# # -- END check to see if matches based on archive ID. --# # -- END loop over article list --# # add info. to summary outputter. my_summary_helper.set_prop_value("single_match_counter", single_match_counter) my_summary_helper.set_prop_desc("single_match_counter", "Single Match") my_summary_helper.set_prop_value("multi_match_counter", multi_match_counter) my_summary_helper.set_prop_desc("multi_match_counter", "Multiple Match") my_summary_helper.set_prop_value("no_match_counter", no_match_counter) my_summary_helper.set_prop_desc("no_match_counter", "No Match") my_summary_helper.set_prop_value("archive_match_counter", archive_match_counter) my_summary_helper.set_prop_desc("archive_match_counter", "No Match, but archive ID found") # set stop time. my_summary_helper.set_stop_time() # generate summary string. summary_string += my_summary_helper.create_summary_string(item_prefix_IN="==> ")