Example #1
def load_classifier(self):
    """
    Gets the classifier when it is first required.
    """
    if not hasattr(self, 'classifier'):
        manager = RedisManager()
        self.classifier = manager.pickle_load(self.classifier_type)
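Below is a minimal, self-contained sketch of the lazy-load pattern this example uses; the class name and the dictionary standing in for the Redis-backed pickle store are hypothetical, for illustration only.

_STORE = {'naivebayes': object()}  # hypothetical stand-in for the classifier pickled in Redis

class LazyGuesser:
    def __init__(self, classifier_type='naivebayes'):
        self.classifier_type = classifier_type

    def load_classifier(self):
        # memoize: only hit the (stand-in) store on the first call
        if not hasattr(self, 'classifier'):
            self.classifier = _STORE[self.classifier_type]

g = LazyGuesser()
g.load_classifier()  # first call loads from the store
g.load_classifier()  # later calls are no-ops: the attribute is already cached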
Example #2
def get_sample_limit(db):
    """
    Returns the limit of samples so that both positive and negative samples
    will remain balanced.

    Keyword Arguments:
    db (str) -- Name of the database to use.

    """

    # counting rows is expensive on a large database, so we cache
    # the limit in Redis and reuse it when it is present
    m = RedisManager()
    if m.r.exists('limit'):
        return int(m.r.get('limit'))

    db = db_init(db=db)
    cursor = db.cursor()
    cursor.execute("SELECT COUNT(*) FROM item WHERE sentiment = 'positive'")
    pos_count = cursor.fetchone()[0]
    cursor.execute("SELECT COUNT(*) FROM item WHERE sentiment = 'negative'")
    neg_count = cursor.fetchone()[0]

    # the limit is the smaller class count, which keeps the classes balanced
    limit = min(pos_count, neg_count)

    m.r.set('limit', limit)

    return limit
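The function above uses a cache-aside pattern: check Redis, fall back to the expensive query, then cache the result. A distilled sketch of that pattern follows; it assumes a reachable Redis server and the redis-py package, neither of which is shown in the original snippet, and the helper name is illustrative.

import redis

def cached_int(r, key, compute):
    """Cache-aside: return the cached integer if present, else compute and store it."""
    val = r.get(key)
    if val is not None:
        return int(val)
    result = compute()
    r.set(key, result)
    return result

# usage sketch (requires a running Redis server):
# r = redis.Redis()
# limit = cached_int(r, 'limit', lambda: run_expensive_count_query())  # hypothetical query wrapper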
Example #3
def redis_feature_consumer(samples, **kwargs):
    """
    Stores features and their counts in redis via a pipeline.
    """

    rm = RedisManager()
    pipeline = rm.r.pipeline()

    neg_processed, pos_processed = 0, 0

    for text, label in samples:

        count_label = label + '_feature_counts'

        tokens = normalize_text(text)

        if tokens:
            if label.startswith('pos'):
                pos_processed += 1
            else:
                neg_processed += 1

            for word in set(tokens):  # count each word at most once per sample
                # note: this is the redis-py < 3.0 call order zincrby(name, value);
                # redis-py >= 3.0 changed it to zincrby(name, amount, value)
                pipeline.zincrby(count_label, word)

    pipeline.incr('negative_processed', neg_processed)
    pipeline.incr('positive_processed', pos_processed)

    pipeline.execute()
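For context, a pipeline queues commands client-side and sends them in a single round trip. A minimal sketch follows, assuming a running local Redis server and redis-py >= 3.0, where the argument order is zincrby(name, amount, value).

import redis

r = redis.Redis(decode_responses=True)
pipe = r.pipeline()
for word in {'great', 'awful'}:  # a set de-duplicates words, as above
    pipe.zincrby('positive_feature_counts', 1, word)
pipe.execute()  # one network round trip for all queued commands
print(r.zscore('positive_feature_counts', 'great'))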
Example #4
def _get_classifier(self):
    """
    Gets the classifier when it is first required.
    """
    logger.debug("Retrieving classifier ...")
    self._classifier = RedisManager(db=self.redis_db).pickle_load(
        self.classifier_type)
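Unlike Example #1, this setter does not guard with hasattr, so it is presumably wired to a lazy property elsewhere in the class. A hypothetical sketch of such wiring, with a stand-in object in place of the Redis pickle load:

class Guesser:
    classifier_type = 'naivebayes'

    def _get_classifier(self):
        # stand-in for RedisManager(db=...).pickle_load(self.classifier_type)
        self._classifier = object()

    @property
    def classifier(self):
        # lazy: load on first access, reuse the cached attribute afterwards
        if not hasattr(self, '_classifier'):
            self._get_classifier()
        return self._classifier

g = Guesser()
assert g.classifier is g.classifier  # loaded once, cached thereafter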
Example #5
def __init__(self, best_words=None):
    if best_words:
        self.best_words = best_words
    else:
        self.best_words = RedisManager().get_best_features()
Example #6
def test_accuracy(db_name='',
                  test_samples=0,
                  neutral_range=0,
                  offset=0,
                  redis_db=5):
    """
    Returns two accuracies and the classifier:
    NLTK accuracy is the internal accuracy of the classifier.
    Manual accuracy is the accuracy measured against pre-flagged/known samples and labels.

    Keyword Arguments:
    db_name (str) -- Samples database to use; by default this is the same as your trained
                     database, with an offset to ensure unseen data. Should be the name of a
                     database located in ~/.synt.

    test_samples (int) -- Number of samples to use; by default this is 25% of the training set size.

    neutral_range (float) -- Used to drop "neutral" guesses to see how real-world accuracy will
                             look. For example, with a neutral range of 0.2, a guessed sentiment
                             that is not greater than 0.2 or less than -0.2 is not counted.
                             Leaving this at 0 disables the drop and categorizes every text as
                             either positive or negative, which may be undesired since the
                             classifier will then treat 0.0001 as positive even though it is not
                             a strong indication.

    offset (int) -- By default the offset is the end of the trained range, i.e. if you've
                    trained on 1000 samples and have 250 testing samples, the samples retrieved
                    will be 1000-1250. You can override this offset to use a different subset.

    redis_db (int) -- The redis database to use.
    """

    m = RedisManager(db=redis_db)
    trained_classifier = m.r.get(
        'trained_classifier')  #retrieve the trained classifier

    if not trained_classifier:
        print("Accuracy needs a classifier, have you trained?")
        return

    classifier = m.pickle_load(trained_classifier)

    # to make sure we test on unseen samples, we use trained_to as our offset
    # and take the samples after it, unless an offset is otherwise specified
    trained_to = int(m.r.get('trained_to'))

    if not offset:
        offset = trained_to

    if test_samples <= 0:  # if no test sample count is given, use 25% of the training count
        test_samples = int(trained_to * .25)

    if not db_name:
        db_name = m.r.get('trained_db')  #use the trained samples database

    test_samples = get_samples(db_name,
                               test_samples,
                               offset=offset,
                               redis_db=redis_db)

    testfeats = []
    trained_ext = m.r.get('trained_extractor')

    feat_ex = get_extractor(trained_ext)()

    #normalization and extraction
    for text, label in test_samples:
        tokens = normalize_text(text)
        bag_of_words = feat_ex.extract(tokens)

        if bag_of_words:
            testfeats.append((bag_of_words, label))

    nltk_accuracy = nltk.classify.util.accuracy(
        classifier, gold=testfeats) * 100  # percentify

    total_guessed = 0
    total_correct = 0
    total_incorrect = 0

    g = Guesser(extractor_type=trained_ext)

    #compare the guessed sentiments with our samples database to determine manual accuracy
    for text, label in test_samples:
        guessed = g.guess(text)
        if abs(guessed) < neutral_range:
            continue

        if (guessed > 0) == label.startswith('pos'):
            total_correct += 1
        else:
            # print(text, label, guessed)
            total_incorrect += 1

        total_guessed += 1

    assert total_guessed, "There were no guesses, make sure you've trained on the same database you're testing."

    manual_accuracy = total_correct * 100.0 / total_guessed

    #TODO: precision and recall

    return (nltk_accuracy, manual_accuracy, classifier)
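The neutral_range handling above reduces to a small predicate; here is a standalone sketch (the function name is illustrative, not part of the library):

def counts_toward_accuracy(guessed, neutral_range=0.2):
    # guesses inside (-neutral_range, +neutral_range) are treated as neutral and skipped,
    # so a weak guess such as 0.0001 is not scored as a positive
    return abs(guessed) >= neutral_range

assert counts_toward_accuracy(0.5)
assert counts_toward_accuracy(-0.3)
assert not counts_toward_accuracy(0.0001)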
Example #7
def train(db_name, samples=200000, classifier_type='naivebayes',
          extractor_type='words', best_features=10000, processes=8,
          purge=False):
    """
    Trains with samples from a sqlite database and stores the resulting classifier in Redis.

    Arguments:
    db_name (str) -- Name of the training database to use, stored in ~/.synt.

    Keyword arguments:
    samples (int) -- Number of samples to train on.
    classifier_type (str) -- Type of classifier to use. Available classifiers are 'naivebayes'.
    extractor_type (str) -- Type of extractor to use. Available extractors are 'words', 'stopwords', 'bestwords'.
    best_features (int) -- Number of highly informative features to store.
    processes (int) -- Number of processes used to count features in parallel.
    purge (bool) -- If True, flushes the redis database.
    """
    m = RedisManager(purge=purge)

    extractor = get_extractor(extractor_type)

    if not db_exists(db_name):
        raise ValueError("Database '%s' does not exist." % db_name)

    if m.r.exists(classifier_type):
        print("Classifier exists in Redis. Purge to re-train.")
        return

    classifier = config.CLASSIFIERS.get(classifier_type)
    if not classifier: #classifier not supported
        raise ValueError("Classifier '%s' not supported." % classifier_type)

    #retrieve training samples from database
    train_samples = get_samples(db_name, samples)

    m.store_feature_counts(train_samples, processes=processes)
    m.store_feature_scores()

    if best_features and best_features > 1:
        m.store_best_features(best_features)

    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)

    # retrieve the actual number of samples processed per label
    neg_processed, pos_processed = m.r.get('negative_processed'), m.r.get('positive_processed')
    # note: FreqDist.inc is the NLTK 2.x API; NLTK 3 uses label_freqdist[label] += n
    label_freqdist.inc('negative', int(neg_processed))
    label_freqdist.inc('positive', int(pos_processed))

    labeled_feature_freqs = m.pickle_load('labeled_feature_freqs')
    labels = labeled_feature_freqs.keys()

    # feature extraction
    feat_ex = extractor()
    # note: the trailing [0] keeps only the first label's extracted features
    extracted_set = set([feat_ex.extract(labeled_feature_freqs[label].keys(), as_list=True) for label in labels][0])

    # count how many times each feature occurred per label and fill the missing occurrences in as False
    for label in labels:
        samples = label_freqdist[label]
        for fname in extracted_set:
            trues = labeled_feature_freqs[label].get(fname, 0)
            falses = samples - trues
            feature_freqdist[label, fname].inc(True, trues)
            feature_freqdist[label, fname].inc(False, falses)

    #create the P(label) distribution
    estimator = ELEProbDist
    label_probdist = estimator(label_freqdist)

    #create the P(fval|label, fname) distribution
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=2)
        feature_probdist[label,fname] = probdist

    #TODO: naivebayes supports this prototype, future classifiers will most likely not
    trained_classifier = classifier(label_probdist, feature_probdist)

    m.pickle_store(classifier_type, trained_classifier)
    m.r.set('trained_to', samples)
    m.r.set('trained_db', db_name)
    m.r.set('trained_classifier', classifier_type)
    m.r.set('trained_extractor', extractor_type)
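For context, here is a toy version of the manual NaiveBayesClassifier construction performed above, with hard-coded counts standing in for the Redis-backed ones. It is a sketch only, and it uses the NLTK 3 mapping API (fd[key] += n) where the original uses the NLTK 2.x FreqDist.inc.

from collections import defaultdict
from nltk.probability import FreqDist, ELEProbDist
from nltk.classify import NaiveBayesClassifier

# P(label): two positive and two negative training samples
label_freqdist = FreqDist()
label_freqdist['positive'] += 2
label_freqdist['negative'] += 2

# P(fval|label, fname): 'great' appeared in both positive samples and in neither negative one
feature_freqdist = defaultdict(FreqDist)
feature_freqdist['positive', 'great'][True] += 2
feature_freqdist['negative', 'great'][False] += 2

label_probdist = ELEProbDist(label_freqdist)
feature_probdist = {lf: ELEProbDist(fd, bins=2)
                    for lf, fd in feature_freqdist.items()}

classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
print(classifier.classify({'great': True}))  # expected: 'positive'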