def describe_data(tweets):
    """
    Evaluate the Tweets provided and return the results object
    """ 

    logger.info('Describing Tweets')
    results = tweet_evaluation_module.setup_analysis(conversation=True, audience=True)
    tweet_evaluation_module.analyze_tweets(tweets,results)
    return results
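# Usage sketch (not from the original module): assumes Tweets arrive as
# one JSON activity per line on stdin and that tweet_evaluation_module,
# logger, and describe_data are importable/configured as above.
if __name__ == '__main__':
    import json
    import sys
    tweets = (json.loads(line) for line in sys.stdin)
    results = describe_data(tweets)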
    def test_hashtag_count(self):
        """ inject hashtags with a predetermined number of test tokens """
        # configure results structure
        results = analysis.setup_analysis(do_conversation=True)
        # use counter for verification
        counter = 0
        for tweet in self.tweets:
            tweet['twitter_entities']['hashtags'].append({"text": "notarandomhashtag"})
            analysis.analyze_tweet(tweet, results)
            counter += 1
        self.assertEqual(results['hashtags']['notarandomhashtag'], counter)
    def test_conversation_length(self):
        """
        Test count of Tweets in analysis code against known number
        """
        # configure results structure
        results = analysis.setup_analysis(do_conversation=True)
        # is this id necessary for testing?
        #results["unique_id"] = "TEST"
        # run analysis code (including conversation)
        analysis.analyze_tweets(self.tweets, results)
        # ground truth from setUp()
        self.assertEqual(results['tweet_count'], self.generator_length_truth)
    def test_bio_term_count(self):
        """ inject bio with a predetermined number of test tokens """
        # configure results structure
        results = analysis.setup_analysis(do_audience=True)
        # use counter for verification
        counter = 1
        for tweet in self.tweets:
            addition = " test_term" * counter
            tweet['actor']['summary'] += addition
            analysis.analyze_tweet(tweet, results)
            counter += 1
        expected_test_count = int(next(results['bio_term_count'].get_tokens())[0])
        self.assertEqual(expected_test_count, sum(range(counter)))
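# The term-count tests above and below only assume that
# results['bio_term_count'] / results['body_term_count'] expose
# get_tokens(), yielding (count, term) pairs with the most frequent term
# first. A minimal stand-in satisfying that contract, for illustration
# only; the real counter in the analysis module is likely richer.
from collections import Counter

class TermCounter(object):
    def __init__(self):
        self.counts = Counter()

    def add_text(self, text):
        # whitespace tokenization; the real tokenizer may differ
        self.counts.update(text.split())

    def get_tokens(self):
        # yield (count, term) pairs, most frequent first
        for term, count in self.counts.most_common():
            yield (count, term)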
    def test_audience_length(self):
        # configure results structure
        results = analysis.setup_analysis(do_audience=True)
        # run analysis code (including audience) for user ids
        analysis.analyze_tweets(self.tweets, results)
        user_ids = results["tweets_per_user"].keys()

        # get ground truth (# unique user ids) from test data file
        p1 = subprocess.Popen(['cat', INPUT_FILE_NAME], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(
            ['python', '-c',
             'import sys; import json; print(len(set([json.loads(i)["actor"]["id"] for i in sys.stdin])))'],
            stdin=p1.stdout, stdout=subprocess.PIPE)
        p1.stdout.close()
        out, err = p2.communicate()
        shell_user_count = int(out)
        self.assertEqual(shell_user_count, len(user_ids))
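# The ground-truth pipeline above shells out to cat and a python
# one-liner; the same count can be computed in-process. A sketch assuming
# INPUT_FILE_NAME holds one JSON activity per line:
import json

def count_unique_users(file_name):
    with open(file_name) as f:
        return len(set(json.loads(line)['actor']['id'] for line in f))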
    def test_body_term_count(self):
        """ inject body with a predetermined number of test tokens """
        # configure results structure
        results = analysis.setup_analysis(do_conversation=True)
        # use counter for verification
        counter = 1
        for tweet in self.tweets:
            addition = " test_term" * counter
            tweet['body'] += addition
            analysis.analyze_tweet(tweet, results)
            counter += 1
        expected_test_count = int(next(results['body_term_count'].get_tokens())[0])
        self.assertEqual(expected_test_count, sum(range(counter)))
Example #10
    splitting_config = None
    if args.splitting_config is not None:
        # if file not in local directory, temporarily extend path to its location
        config_file_full_path = args.config_file.split('/')
        if len(config_file_full_path) > 1:
            path = '/'.join(config_file_full_path[:-1])
            sys.path.append(os.path.join(os.getcwd(), path))
        else:
            sys.path.append(os.getcwd())
        # strip a trailing '.py' extension (str.rstrip removes trailing
        # characters, not a suffix, so it would mangle module names)
        module_name = config_file_full_path[-1]
        if module_name.endswith('.py'):
            module_name = module_name[:-3]
        splitting_config = importlib.import_module(module_name).splitting_config
        sys.path.pop()

        results = analysis.setup_analysis(
            conversation=args.do_conversation_analysis,
            audience=args.do_audience_analysis,
            identifier='analyzed',
            input_results={})
        results = analysis.setup_analysis(
            conversation=args.do_conversation_analysis,
            audience=args.do_audience_analysis,
            identifier='baseline',
            input_results=results)
    else:
        results = analysis.setup_analysis(
            conversation=args.do_conversation_analysis,
            audience=args.do_audience_analysis)

    # manage input sources, file opening, and deserialization
    if args.input_file_name is not None:
        tweet_generator = analysis.deserialize_tweets(open(args.input_file_name))
    else:
        tweet_generator = analysis.deserialize_tweets(sys.stdin)
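# The sys.path juggling above can be avoided by loading the config
# module directly from its file path. A sketch using importlib.util,
# assuming args.config_file names a .py file that defines
# splitting_config:
import importlib.util

def load_splitting_config(config_file):
    spec = importlib.util.spec_from_file_location('user_splitting_config', config_file)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.splitting_config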
    parser.add_argument('-o', '--output-dir', dest='output_directory',
            default=os.environ['HOME'] + '/tweet_evaluation/',
            help='directory for output files; default is %(default)s')
    parser.add_argument('-b', '--baseline-input-file', dest='baseline_input_name',
            default=None,
            help='Tweets against which to run a relative analysis')
    parser.add_argument('--no-insights', dest='use_insights',
            action='store_false', default=True)
    args = parser.parse_args()

    # get the time right now, to use in output naming
    time_now = datetime.datetime.now()
    output_directory = '{0}/{1:04d}/{2:02d}/{3:02d}/'.format(
        args.output_directory.rstrip('/'), time_now.year, time_now.month,
        time_now.day)
    # get the empty results object, which defines the measurements to be run
    results = analysis.setup_analysis(
        do_conversation=args.do_conversation_analysis,
        do_audience=args.do_audience_analysis)

    baseline_results = None
    if args.baseline_input_name is not None:
        baseline_results = analysis.setup_analysis(
            do_conversation=args.do_conversation_analysis,
            do_audience=args.do_audience_analysis)

    if not args.use_insights:
        results.pop('audience_api', None)
        if args.baseline_input_name is not None:
            baseline_results.pop('audience_api', None)
    
    # manage input sources, file opening, and deserialization
    if args.input_file_name is not None:
        tweet_generator = analysis.deserialize_tweets(open(args.input_file_name))
    else:
        tweet_generator = analysis.deserialize_tweets(sys.stdin)
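# The dated output path built with str.format in these scripts can be
# expressed more compactly with strftime; an equivalent sketch:
import datetime
import os

time_now = datetime.datetime.now()
output_directory = os.path.join(
    args.output_directory.rstrip('/'), time_now.strftime('%Y/%m/%d')) + '/'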
            help="file containing tweets, tweet IDs, or user IDs; take input from stdin if not present") 
    parser.add_argument('-o', '--output-dir', dest='output_directory',
            default=os.environ['HOME'] + '/tweet_evaluation/',
            help='directory for output files; default is %(default)s')
    args = parser.parse_args()

    # get the time right now, to use in output naming
    time_now = datetime.datetime.now()
    time_string = time_now.isoformat().split(".")[0].replace(":", "")
    output_directory = '{0}/{1:04d}/{2:02d}/{3:02d}/'.format(
        args.output_directory.rstrip('/'), time_now.year, time_now.month,
        time_now.day)
   
    # create the output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    results = analysis.setup_analysis(conversation=args.do_conversation_analysis,
                                      audience=args.do_audience_analysis)

    # manage input source
    if args.input_file_name is not None:
        input_generator = open(args.input_file_name)
    else:
        input_generator = sys.stdin

    # run analysis
    run_analysis(input_generator, results)

    # dump the output
    output.dump_results(results, output_directory, args.unique_identifier)
    parser.add_argument('-o', '--output-dir', dest='output_directory',
                        default=os.environ['HOME'] + '/tweet_evaluation/',
                        help='directory for output files; default is %(default)s')
    parser.add_argument('-b',
                        '--baseline-input-file',
                        dest='baseline_input_name',
                        default=None,
                        help='Tweets against which to run a relative analysis')
    args = parser.parse_args()

    # get the time right now, to use in output naming
    time_now = datetime.datetime.now()
    output_directory = '{0}/{1:04d}/{2:02d}/{3:02d}/'.format(
        args.output_directory.rstrip('/'), time_now.year, time_now.month,
        time_now.day)
    # get the empty results object, which defines the measurements to be run
    results = analysis.setup_analysis(
        do_conversation=args.do_conversation_analysis,
        do_audience=args.do_audience_analysis)

    baseline_results = None
    if args.baseline_input_name is not None:
        baseline_results = analysis.setup_analysis(
            do_conversation=args.do_conversation_analysis,
            do_audience=args.do_audience_analysis)

    # manage input sources, file opening, and deserialization
    if args.input_file_name is not None:
        tweet_generator = analysis.deserialize_tweets(
            open(args.input_file_name))
    else:
        tweet_generator = analysis.deserialize_tweets(sys.stdin)
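# End-to-end sketch combining the calls shown in these snippets; the
# module and function names mirror the code above, while the identifier
# string is a hypothetical placeholder.
import sys

results = analysis.setup_analysis(do_conversation=True, do_audience=True)
tweet_generator = analysis.deserialize_tweets(sys.stdin)
analysis.analyze_tweets(tweet_generator, results)
output.dump_results(results, output_directory, 'example_run')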