Python sanitize_text 예제들, synt.utils.text.sanitize_text Python 예제들

예제 #1

0

파일 보기

파일: guesser.py 프로젝트: daniel-cloudspace/synt

def guess(text, classifier=DEFAULT_CLASSIFIER, feat_ex=best_word_feats):
    """
    Take a blob of text and return its sentiment score (-1.0 .. 1.0).

    The text is tokenized with sanitize_text, converted to features with
    feat_ex, and scored as P('positive') - P('negative') from the
    classifier's prob_classify() result.

    Keyword Arguments:
    classifier      -- the classifier to use  (Note: for now we only have a naivebayes classifier)
    feat_ex         -- the feature extractor to use i.e bigram_word_feats, stopword_feats, found in extractors

    Returns:
    A float in [-1.0, 1.0]. 0.0 is returned when no features could be
    extracted from the text, or when the computed score falls outside
    the valid range.

    Raises:
    AssertionError if no classifier is supplied.
    """

    # Kept as an assert so callers that catch AssertionError keep working.
    assert classifier, "Needs a classifier."

    bag_of_words = feat_ex(sanitize_text(text))
    if not bag_of_words:
        return 0.0

    prob = classifier.prob_classify(bag_of_words)

    # Map the two class probabilities onto a single -1 .. 1 score.
    score = prob.prob('positive') - prob.prob('negative')

    # A score outside -1 .. 1 (e.g. a heavily biased value such as -9.8343
    # for a single word) is not meaningful; report neutral instead. The
    # negated chained comparison also catches NaN.
    if not (-1 <= score <= 1):
        return 0.0

    return score

예제 #2

0

파일 보기

파일: redis_manager.py 프로젝트: daniel-cloudspace/synt

    def store_word_counts(self, wordcount_samples=300000):
        """
        Stores word:count histograms for samples in Redis with the ability to increment.

        For every (text, label) sample, each token produced by sanitize_text
        has its score incremented by one under the key '<label>_wordcounts'
        (written with zscore/zadd on self.r).

        Keyword Arguments:
        wordcount_samples -- the number of samples requested from get_samples

        Does nothing when both 'positive_wordcounts' and 'negative_wordcounts'
        already exist in Redis.

        Raises:
        AssertionError if get_samples returns nothing.
        """

        # Both histograms must already be present to skip the rebuild. The
        # previous `'a' and 'b' in keys` form only ever tested the second key.
        existing_keys = self.r.keys()
        if ('positive_wordcounts' in existing_keys
                and 'negative_wordcounts' in existing_keys):
            return
        # NOTE(review): if exactly one histogram exists, its counts are
        # incremented again below -- confirm whether a partial state can occur.

        from synt.utils.db import get_samples

        samples = get_samples(wordcount_samples)
        assert samples, "Samples must be provided."

        for text, label in samples:
            # presumably label is 'positive' or 'negative' -- verify against get_samples
            key = label + '_wordcounts'
            tokens = sanitize_text(text)

            if not tokens:
                continue

            for word in tokens:
                prev_score = self.r.zscore(key, word)
                # A missing (or zero) score starts the count at 1.
                self.r.zadd(key, word, 1 if not prev_score else prev_score + 1)

예제 #3

0

파일 보기

파일: tester.py 프로젝트: daniel-cloudspace/synt

def test(test_samples=200000, feat_ex=best_word_feats):
    """
    This first computes the NLTK accuracy of the classifier then proceeds
    to test across known sentiments and produces a 'manual accuracy score'.

    Each sample is printed together with its known and guessed sentiment,
    followed by the classifier's most informative features and both
    accuracy figures.

    Keyword Arguments:
    test_samples    -- the amount of samples to test against
    feat_ex         -- the feature extractor to use (utils/extractors)

    Returns None. If no classifier is stored in Redis a message is printed
    and nothing is tested.
    """

    classifier = RedisManager().load_classifier()

    if not classifier:
        print("There is not classifier in Redis yet, have you trained?")
        return

    print("Preparing %s Testing Samples" % test_samples)
    samples = get_samples(test_samples)

    # Pass 1: build (features, label) pairs for NLTK's own accuracy measure.
    nltk_testing_dicts = []
    for sample in samples:
        text, sentiment = sample[0], sample[1]  # (text, sentiment)
        tokens = sanitize_text(text)

        if tokens:
            nltk_testing_dicts.append((feat_ex(tokens), sentiment))

    nltk_accuracy = nltk.classify.util.accuracy(classifier, nltk_testing_dicts) * 100  # percentify

    # Pass 2: score every sample with guess() and compare the sign of the
    # score with the known sentiment.
    results = []
    for sample in samples:
        text, sentiment = sample[0], sample[1]  # (text, sentiment)

        # Use the classifier and extractor under test so the manual score
        # measures the same configuration as the NLTK score above.
        guessed = guess(text, classifier=classifier, feat_ex=feat_ex)

        accurate = (
            (sentiment.startswith('pos') and guessed > 0)
            or (sentiment.startswith('neg') and guessed < 0)
        )
        results.append((accurate, sentiment, guessed, text))

    accurate_samples = 0
    for accurate, sentiment, guessed, text in results:
        print ("Text: %s" % (text))
        print ("Accuracy: %s | Known Sentiment: %s | Guessed Sentiment: %s " % (accurate, sentiment, guessed))
        print ("------------------------------------------------------------------------------------------------------------------------------------------")

        if accurate:
            accurate_samples += 1

    # Computed once after the loop; previously it was assigned inside the
    # loop and therefore undefined when there were no results.
    total_accuracy = (accurate_samples * 100.00 / len(samples)) if samples else 0.0

    classifier.show_most_informative_features(30)
    print("\n\rManual classifier accuracy result: %s%%" % total_accuracy)
    print("\n\rNLTK classifier accuracy result: %.2f%%" % nltk_accuracy)

예제 #4

0

파일 보기

파일: trainer.py 프로젝트: daniel-cloudspace/synt

def _build_labeled_feats(samples, feat_ex):
    """Return (features, label) pairs for samples that yield any features."""
    labeled = []
    for text, sent in samples:
        feats = feat_ex(sanitize_text(text))
        if feats:
            labeled.append((feats, sent))
    return labeled


def train(
    feat_ex=best_word_feats,
    train_samples=400000,
    wordcount_samples=300000,
    wordcount_range=150000,
    force_update=False,
    verbose=True,
):
    """
    Trains a Naive Bayes classifier with samples from database and stores the
    resulting classifier in Redis.

    Args:
    feat_ex           -- the feature extractor to use, found in utils/extractors.py

    Keyword arguments:
    train_samples     -- the amount of samples to train half this number will be negative the other positive
    wordcount_samples -- the amount of samples to build wordcounts, this produces a word:count histogram in Redis
    wordcount_range   -- the amount of 'up-to' words to use for the FreqDist will pick out the most
                         'popular' words up to this amount. i.e top 150000 tokens
    force_update      -- passed to RedisManager; if True will drop the Redis DB and assume a new train
    verbose           -- if True will output to console

    Returns None. Returns early when a trained classifier already exists in
    Redis, or when no features could be built from the samples.
    """
    import logging

    logger = create_logger(__file__)
    if not verbose:  # no output
        # Level 0 is NOTSET, which defers to ancestor loggers instead of
        # silencing this one; a level above CRITICAL suppresses every record.
        # Assumes create_logger returns a standard logging.Logger -- TODO confirm.
        logger.setLevel(logging.CRITICAL + 1)

    man = RedisManager(force_update=force_update)

    if "classifier" in man.r.keys():
        logger.info("Trained classifier exists in Redis.")
        return

    logger.info("Storing %d word counts." % wordcount_samples)
    man.store_word_counts(wordcount_samples)
    logger.info("Build frequency distributions with %d words." % wordcount_range)
    man.build_freqdists(wordcount_range)
    logger.info("Storing word scores.")
    man.store_word_scores()
    logger.info("Storing best words.")
    man.store_best_words()

    samples = get_samples(train_samples)

    # Floor division: slice indices must be integers on Python 3.
    half = len(samples) // 2

    # presumably get_samples returns the positive samples first -- verify
    pos_samples = samples[:half]
    neg_samples = samples[half:]

    logger.info("Build negfeats and posfeats")
    negfeats = _build_labeled_feats(neg_samples, feat_ex)
    posfeats = _build_labeled_feats(pos_samples, feat_ex)

    if not (negfeats or posfeats):
        logger.error("Could not build positive and negative features.")
        return

    negcutoff = len(negfeats) * 3 // 4  # 3/4 training set
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    logger.info("Train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    logger.info("Done training")

    man.store_classifier(classifier)
    logger.info("Stored to Redis")

    # Evaluate on the held-out quarter of each class.
    logger.info("Classifier Accuracy: %s" % util.accuracy(classifier, testfeats))