Exemplo n.º 1
0
 def test_guess_unicode(self):
     # Non-ASCII (Japanese) input must still produce a float score that
     # lies inside the closed interval [-1.0, 1.0].
     classifier = self.man.load_classifier()
     unicode_text = "FOE JAPANが粘り強く主張していた避難の権利"
     score = guess(unicode_text, classifier=classifier)
     self.assertEqual(type(score), float)
     lower, upper = -1.0, 1.0
     self.assertTrue(lower <= score <= upper)
Exemplo n.º 2
0
 def test_guess_no_text(self):
     # An empty string must be scored as a float equal to 0.0.
     classifier = self.man.load_classifier()
     empty_text = ""
     score = guess(empty_text, classifier=classifier)
     self.assertEqual(type(score), float)
     self.assertEqual(score, 0.0)
Exemplo n.º 3
0
def test(test_samples=200000, feat_ex=best_word_feats):
    """
    Report the accuracy of the classifier loaded through RedisManager.

    This first computes the NLTK accuracy of the classifier over the
    feature sets extracted from the samples, then proceeds to run ``guess``
    on every sample, comparing the sign of the score against the known
    sentiment to produce a 'manual accuracy score'. Every per-sample result,
    the 30 most informative features and both accuracy figures are printed
    to stdout.

    Keyword Arguments:
    test_samples    -- the amount of samples to test against
    feat_ex         -- the feature extractor to use (utils/extractors)

    Returns None. If no classifier can be loaded a message is printed and
    the function returns early without testing anything.
    """

    classifier = RedisManager().load_classifier()

    if not classifier:
        print("There is not classifier in Redis yet, have you trained?")
        return

    print("Preparing %s Testing Samples" % test_samples)
    samples = get_samples(test_samples)

    # Build the (features, label) pairs NLTK needs for its own accuracy
    # figure; samples whose text sanitizes to nothing are left out.
    nltk_testing_dicts = []
    for sample in samples:
        text, sentiment = sample[0], sample[1]  # (text, sentiment)
        tokens = sanitize_text(text)

        if tokens:
            nltk_testing_dicts.append((feat_ex(tokens), sentiment))

    nltk_accuracy = nltk.classify.util.accuracy(classifier, nltk_testing_dicts) * 100  # percentify

    # Manual pass: a guess counts as accurate only when its sign agrees with
    # the known sentiment label (a score of exactly 0 is never accurate).
    results = []
    for sample in samples:
        text, sentiment = sample[0], sample[1]  # (text, sentiment)
        guessed = guess(text)

        accurate = bool(
            (sentiment.startswith('pos') and guessed > 0)
            or (sentiment.startswith('neg') and guessed < 0)
        )

        results.append((accurate, sentiment, guessed, text))

    accurate_samples = 0
    for accurate, sentiment, guessed, text in results:
        print("Text: %s" % text)
        print("Accuracy: %s | Known Sentiment: %s | Guessed Sentiment: %s " % (accurate, sentiment, guessed))
        print("------------------------------------------------------------------------------------------------------------------------------------------")

        if accurate:
            accurate_samples += 1

    # Computed once, after the loop, so it is always defined; with no samples
    # at all the manual accuracy is reported as 0.0 instead of raising.
    sample_count = len(samples)
    if sample_count:
        total_accuracy = accurate_samples * 100.00 / sample_count
    else:
        total_accuracy = 0.0

    classifier.show_most_informative_features(30)
    print("\n\rManual classifier accuracy result: %s%%" % total_accuracy)
    print("\n\rNLTK classifier accuracy result: %.2f%%" % nltk_accuracy)
Exemplo n.º 4
0
 def test_guess_with_text(self):
     # Plain ASCII input must produce a float score that lies inside the
     # closed interval [-1.0, 1.0].
     classifier = self.man.load_classifier()
     sample_text = "some random text"
     score = guess(sample_text, classifier=classifier)
     self.assertEqual(type(score), float)
     lower, upper = -1.0, 1.0
     self.assertTrue(lower <= score <= upper)
Exemplo n.º 5
0
def main():
    """
    Command line entry point for synt.

    Builds an argparse parser with four sub-commands and dispatches on the
    one that was selected:

    train    -- calls ``train`` with the sample/word-count options
    collect  -- calls ``fetch`` or ``collect`` depending on the 'fetch' argument
    guess    -- prints the result of ``guess`` on the stripped text
    test     -- calls ``test``, optionally with --samples

    When no sub-command is supplied the help text is printed instead of
    silently doing nothing.
    """

    parser = argparse.ArgumentParser(description='Tool to interface with synt, provides a way to train, collect and guess from the command line.')

    # The chosen sub-command name is stored on args.parser.
    subparsers = parser.add_subparsers(help='sub-command help', dest='parser')

    #train command
    parser_train = subparsers.add_parser('train', help='Train a classifer')
    parser_train.add_argument(
            '--train_samples',
            action='store',
            type=int,
            default=2000,
            help="""The amount of samples to train on."""
    )

    parser_train.add_argument(
            '--wc_samples',
            action='store',
            type=int,
            default=2000,
            help="""We store a word:count mapping to determine a list of useful and popular words to use.
            This is the the number of samples to generate our words from. Generally you want this number to
            be pretty high as it will gradually reduce variations and produce a consistent set of useful words."""
    )

    parser_train.add_argument(
            '--wc_range',
            action='store',
            type=int,
            default=2000,
            help="""This is the actual amount of words to use to build freqDists. By this point (depending on how many word samples used) you will have a lot of tokens. Most of these tokens are uninformative and produce nothing but noise. This is the first layer of cutting down that batch to something reasonable. The number provided will use words
            from 0 .. wc_range. Words are already sorted by most frequent to least."""

    )

    # --fresh and --verbose are integer flags (1/0) rather than store_true
    # switches; their defaults are the booleans False and True.
    parser_train.add_argument(
        '--fresh',
        action='store',
        type=int,
        default=False,
        help="""If True this will force a new train, useful to test various sample, wordcount combinations. 1 = True 0 = False"""
    )

    parser_train.add_argument(
        '--verbose',
        action='store',
        type=int,
        default=True,
        help="""Displays log info to stdout by default. 1 = True 0 = False"""
    )

    #collect command
    parser_collect = subparsers.add_parser('collect', help='Collect sample data.')
    parser_collect.add_argument('fetch', help='Grab the sample_database')
    # NOTE(review): --time is parsed but never read in this function.
    parser_collect.add_argument('--time', action='store', type=int, default=500)

    #guess command
    parser_guess = subparsers.add_parser(
        'guess',
        description="Guess' sentiment. This relies on a trained classifier to exist in the database which means you should run 'train' before attempting to guess. The output is a float between -1 and 1 detailing how negative or positive the sentiment is. Anything close to 0 should be treated as relativley neutral.",
    )
    parser_guess.add_argument(
            'text',
            action='store',
            help = 'Text to guess on.'
    )

    #tester command
    parser_tester = subparsers.add_parser(
        'test',
        description = """Runs the tester test function to test accuracy. You can provide a number of samples by --samples [num]"""
    )

    parser_tester.add_argument(
        '--samples',
        action='store',
        type=int,
        help='Tests the accuracy with number of samples as test samples.',
    )

    args = parser.parse_args()

    # Exactly one sub-command can be selected, so dispatch with one chain.
    if args.parser == 'train':
        train(
            train_samples=args.train_samples,
            wordcount_samples=args.wc_samples,
            wordcount_range=args.wc_range,
            verbose=args.verbose,
            force_update=args.fresh,
        )

    elif args.parser == 'collect':
        # NOTE(review): 'fetch' is a required positional, so args.fetch is
        # whatever string the user typed; collect() is only reached when that
        # string is empty. Confirm whether an optional flag was intended.
        if args.fetch:
            fetch()
        else:
            collect()

    elif args.parser == 'guess':
        text = args.text.strip()
        print(guess(text=text))

    elif args.parser == 'test':
        if args.samples:
            test(test_samples=args.samples)
        else: #runs with default test_samples
            test()

    else:
        # On Python 3 sub-commands are optional by default, so args.parser
        # is None when none was given; show usage rather than exit silently.
        parser.print_help()