# Got something?
        if ( ( author_organization ) and ( author_organization != "" ) ):
            
            # save it.
            print( "    - org. string: " + author_organization )
            current_article_author.organization_string = author_organization
            current_article_author.save()
            
        #-- END check to see if there is an organization. --#
        
    #-- END check to see if we have an author string --#

#-- END loop over authors with no organization_string --#

# Processing is finished - record the stop time in the summary helper.
my_summary_helper.set_stop_time()

# Register each audit counter with the summary helper: the value first,
#    then the label that describes it.
for summary_key, summary_value, summary_label in (
    ( "org_string_counter", org_string_counter, "Found org. string" ),
    ( "article_source_counter", article_source_counter, "Using Article Source" ),
    ( "bad_author_string_counter", bad_author_string_counter, "Bad Author String" ),
):

    my_summary_helper.set_prop_value( summary_key, summary_value )
    my_summary_helper.set_prop_desc( summary_key, summary_label )

#-- END loop over summary properties --#

# Append the rendered summary to whatever output was already accumulated,
#    then display it.
summary_string = summary_string + my_summary_helper.create_summary_string( item_prefix_IN = "==> " )
print( summary_string )
# Example #2
0
    	DjangoMemoryHelper.free_memory()
    	
    	# reset mem_counter
    	mem_counter = 0
    	
    #-- END check to see if we manage memory now --#
    
#-- END loop over articles --#

# Post-loop book-keeping: record the stop time, then hand the audit
#    information to the summary helper.
my_summary_helper.set_stop_time()

# Plain counters.
my_summary_helper.set_prop_value( "article_count", article_counter )
my_summary_helper.set_prop_desc( "article_count", "Article count" )

my_summary_helper.set_prop_value( "done_counter", done_counter )
my_summary_helper.set_prop_desc( "done_counter", "Article DONE count" )

# Problem lists - each is rendered as a single comma-delimited string of
#    its items before being stored.
for summary_key, summary_items, summary_label in (
    ( "no_text_list", no_text_list, "Articles with no text" ),
    ( "multiple_text_list", multiple_text_list, "Articles with multiple text" ),
    ( "no_raw_data_list", no_raw_data_list, "Articles with no raw data" ),
    ( "multiple_raw_data_list", multiple_raw_data_list, "Articles with multiple raw data" ),
):

    my_summary_helper.set_prop_value( summary_key, ", ".join( str( summary_item ) for summary_item in summary_items ) )
    my_summary_helper.set_prop_desc( summary_key, summary_label )

#-- END loop over problem lists --#
# Bulk creation flag (per the original note, it defaults to True).
#    Uncomment to turn bulk creation off.
#reddit_collector.do_bulk_create = False

# Fresh summary helper for this run.
my_summary_helper = SummaryHelper()

# Select the posts whose comments will be collected.  Single-post
#    alternative, kept for reference:
#post_qs = reddit_collect.models.Post.objects.filter( reddit_id = reddit_post_id )
test_post_id_list = [ '1cp0i3', '1d67nv' ]
post_qs = reddit_collect.models.Post.objects.filter( reddit_id__in = test_post_id_list )

# Report the comment count stored on the first post in the QuerySet
#    (original note: 115, at time of collection).
django_post = post_qs[ 0 ]
post_num_comments = django_post.num_comments
print( "==> num_comments: " + str( post_num_comments ) )

my_summary_helper.set_prop_value( "num_comments", post_num_comments )
my_summary_helper.set_prop_desc( "num_comments", "num_comments (post)" )

# Hand the whole QuerySet to collect_comments().
reddit_collector.collect_comments( post_qs )

#================================================================================
# Now, compare collect_comments() output to just grabbing comments with reddiwrap
#================================================================================

# Load the post again by its reddit ID.
# NOTE(review): reddit_post_id is not assigned anywhere in this excerpt (the
#    assignment below is commented out) - confirm it is set earlier.
# reddit_post_id = '1cp0i3'
django_post = reddit_collect.models.Post.objects.get( reddit_id = reddit_post_id )

# Build the reddiwrap post used to pull comments straight from reddit.
comment_rw_post = django_post.create_reddiwrap_post()
        #-- END check to see if related post passed in. --#
    
    #-- END loop over comments. --#
    
    # update aggregate counter
    aggregate_counter += comment_counter

    # memory management
    gc.collect()
    django.db.reset_queries()

    # do query again to see if more to process.
    comment_qs = reddit_collect.models.Comment.objects.filter( subreddit = None )

    # get count of comments.
    comment_count = comment_qs.count()

    print( " - Bottom of loop - After count() - processed " + str( aggregate_counter ) + " of " + str( total_comment_count ) + " - " + str( comment_count ) + " to go - " + str( datetime.datetime.now() ) )

#-- END loop over all comments. --#

# Processing is finished - record the stop time in the summary helper.
my_summary_helper.set_stop_time()

# Store the total number of comments processed, along with its label.
aggregate_summary_key = "aggregate_comment_count"
my_summary_helper.set_prop_value( aggregate_summary_key, aggregate_counter )
my_summary_helper.set_prop_desc( aggregate_summary_key, "Comments Processed" )

# Append the rendered summary to whatever output was already accumulated,
#    then display it.
summary_string = summary_string + my_summary_helper.create_summary_string( item_prefix_IN = "==> " )
print( summary_string )
# Example #5
0
    def code_article_data( self, query_set_IN ):

        """
            Codes every Article in the query set passed in and returns a
               status message.

            Asks self.get_coder_instance() for a coder, initializes it from
               the parameter dictionary held by self.get_param_container(),
               then hands each article in turn to the coder's code_article()
               method.  The status returned for each article (or, when an
               exception is caught, the exception message) is stored via
               self.record_article_status().  What the coder persists for an
               article is decided inside code_article(), not here.

            A problem with one article never stops the run: a status other
               than ArticleCoder.STATUS_SUCCESS is counted as an error, and
               any Exception raised while coding is passed to the exception
               helper and counted, then the loop moves on.

            When self.do_manage_time is True, self.start_request() is called
               before each article and self.may_i_continue() after it.  As
               soon as may_i_continue() returns something other than True,
               the loop stops and the remaining articles are left untouched.

            Preconditions: expects a non-empty query set of Articles.  If
               query_set_IN is None or empty, nothing is coded and the three
               counters in the summary keep their initial value of -1.

            Postconditions: a summary (articles processed, error count,
               exception count) is always logged at INFO level and appended
               to the returned status, even when nothing was coded.

            Parameters:
            - query_set_IN - query set of Article instances to code.  The `id` and `headline` of each article are read here.

            Returns:
            - String - Status message: the coder's debug output (only when a query set was processed and ArticleCoder.DEBUG_FLAG is True), followed by the summary string.
        """

        # return reference
        status_OUT = ''

        # declare variables
        me = "code_article_data"
        summary_string = ""

        # auditing variables - remain at -1 when there is no query set, so
        #    the summary can tell "nothing to do" apart from "0 processed".
        article_counter = -1
        exception_counter = -1
        error_counter = -1

        # grab a logger.
        my_logger = self.get_logger()

        # do I print some status?  Normalized to a boolean once, using the
        #    same "== True" test the per-message checks always used.
        do_i_print_updates = ( self.do_print_updates == True )

        # initialize summary helper
        my_summary_helper = SummaryHelper()

        # init rate-limiting (same normalization as above).
        am_i_rate_limited = ( self.do_manage_time == True )

        def output_message( log_method_IN, message_IN ):

            # Send message to the logger method passed in and, if status
            #    printing is enabled, to standard output as well.
            log_method_IN( message_IN )

            if ( do_i_print_updates ):

                print( message_IN )

            #-- END check to see if we print a message. --#

        #-- END nested function output_message() --#

        # do we have a query set?
        if ( query_set_IN ):

            # create instance of ArticleCoder and initialize it from the
            #    dictionary inside the parent parameter container.
            article_coder = self.get_coder_instance()
            my_params = self.get_param_container()
            param_dict = my_params.get_parameters()
            article_coder.initialize_from_params( param_dict )

            # loop on the article list, passing each to the ArticleCoder for
            #    processing.
            article_counter = 0
            exception_counter = 0
            error_counter = 0
            for current_article in query_set_IN:

                # increment article counter
                article_counter += 1

                # rate-limited?  If so, start timer.
                if ( am_i_rate_limited ):

                    self.start_request()

                #-- END pre-request check for rate-limiting --#

                # a little debugging to start
                logging_message = "\n\n============================================================\n==> article " + str( article_counter ) + ": " + str( current_article.id ) + " - " + current_article.headline
                output_message( my_logger.info, logging_message )

                # per-article exception handling, so one bad article does not
                #    abort the run and we can count how many cause problems.
                try:

                    # code the article and record the resulting status.
                    current_status = article_coder.code_article( current_article )
                    self.record_article_status( current_article.id, current_status )

                    # success?
                    if ( current_status != ArticleCoder.STATUS_SUCCESS ):

                        # nope.  Error.
                        error_counter += 1

                        logging_message = "======> In " + me + "(): ERROR - " + current_status + "; article = " + str( current_article )
                        output_message( my_logger.debug, logging_message )

                    #-- END check to see if success --#

                except Exception as e:

                    # increment exception_counter
                    exception_counter += 1

                    # log exception through the helper, no email or anything.
                    my_exception_helper = self.get_exception_helper()
                    exception_message = "Exception caught for article " + str( current_article.id )
                    my_exception_helper.process_exception( e, exception_message )

                    logging_message = "======> " + exception_message + " - " + str( e )
                    output_message( my_logger.debug, logging_message )

                    # the exception message becomes the article's status.
                    self.record_article_status( current_article.id, logging_message )

                #-- END exception handling around individual article processing. --#

                # rate-limited?  If so, check if we may continue.
                if ( am_i_rate_limited ):

                    continue_work = self.may_i_continue()

                    if not ( continue_work == True ):

                        # not OK to continue work.  Nothing more would be
                        #    done for any remaining article, so stop here
                        #    rather than iterating the rest of the query set
                        #    for no effect.
                        break

                    #-- END check to see if OK to continue. --#

                #-- END post-request check for rate-limiting --#

            #-- END loop over articles --#

            # add some debug?
            if ( ArticleCoder.DEBUG_FLAG == True ):

                # yup.
                status_OUT += "\n\n" + article_coder.debug + "\n\n"

            #-- END check to see if we have debug to output. --#

        #-- END check to make sure we have a query set. --#

        # add stuff to summary and log the results.

        # set stop time
        my_summary_helper.set_stop_time()

        # add stuff to summary
        my_summary_helper.set_prop_value( "article_counter", article_counter )
        my_summary_helper.set_prop_desc( "article_counter", "Articles processed" )

        my_summary_helper.set_prop_value( "error_counter", error_counter )
        my_summary_helper.set_prop_desc( "error_counter", "Error count" )

        my_summary_helper.set_prop_value( "exception_counter", exception_counter )
        my_summary_helper.set_prop_desc( "exception_counter", "Exception count" )

        # output - set prefix if you want.
        summary_string += my_summary_helper.create_summary_string( item_prefix_IN = "==> " )
        my_logger.info( summary_string )

        # output summary string as status.
        status_OUT += summary_string

        return status_OUT
# Example #6
0
                match_counter += 1

                # print the match.
                print("        - archive ID Match ( edition: " + match_article.edition + " ): " + str(match_article))

            # -- END loop over matches --#

        # -- END check to see if one or more id matches. --#

    # -- END check to see if matches based on archive ID. --#

# -- END loop over article list --#

# Hand each match counter to the summary helper: the value first, then the
# label that describes it.
for summary_key, summary_value, summary_label in (
    ("single_match_counter", single_match_counter, "Single Match"),
    ("multi_match_counter", multi_match_counter, "Multiple Match"),
    ("no_match_counter", no_match_counter, "No Match"),
    ("archive_match_counter", archive_match_counter, "No Match, but archive ID found"),
):
    my_summary_helper.set_prop_value(summary_key, summary_value)
    my_summary_helper.set_prop_desc(summary_key, summary_label)

# -- END loop over summary properties --#

# Record the stop time once all counters are registered.
my_summary_helper.set_stop_time()

# Append the rendered summary to whatever output was already accumulated.
summary_string = summary_string + my_summary_helper.create_summary_string(item_prefix_IN="==> ")