Example #1
from collections import defaultdict, Counter

# `Read` is assumed to be a project-level module providing load_data(); its import
# is not shown in this excerpt.


def test(test_file,
         classifier,
         word_frequency_in_class,
         total_words_in_category,
         categories,
         filter_):
    """Uzima test fajl i za svaki red radi tokenizaciju, zatim provlaci listu kroz NB klasifikator,
    izlaz je kategorija tweeta, uporedjujemo je sa stvarnom kategorijom tweeta (nalazi se u nekoj koloni testa),
    vraca
    """
    total_test_tweets = 0
    correct_tweets = 0
    conf = defaultdict(Counter)  # confusion matrix: actual label -> predicted label counts
    test_data = Read.load_data(test_file)

    for line in test_data:
        total_test_tweets += 1
        # Column 5 holds the tweet text, column 0 the actual label.
        sent = classifier(
            categories,
            filter_(line[5]),
            word_frequency_in_class,
            total_words_in_category
        )
        if sent == line[0]:
            correct_tweets += 1
        conf[line[0]][sent] += 1
    # print (conf)
    print('********* Total number of tweets in the test set is {0}, number of correct predictions is {1}'.format(
        total_test_tweets, correct_tweets))
    print('********* Accuracy is {0}'.format(correct_tweets / float(total_test_tweets)))
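
The classifier passed into test() only needs to accept (categories, tokens, word_frequency_in_class, total_words_in_category) and return one of the categories. Below is a minimal Naive Bayes sketch of such a classifier; it assumes word_frequency_in_class maps each category to a Counter of token counts and total_words_in_category maps each category to its token total (neither structure is shown in the source), and it uses add-one smoothing.

import math

def naive_bayes(categories, tokens, word_frequency_in_class, total_words_in_category):
    # Score each category by the log-likelihood of the tokens under add-one
    # smoothing and return the highest-scoring category (class priors omitted for brevity).
    vocabulary = {w for counts in word_frequency_in_class.values() for w in counts}
    best_category, best_score = None, float('-inf')
    for category in categories:
        score = 0.0
        for token in tokens:
            count = word_frequency_in_class[category].get(token, 0)
            score += math.log((count + 1.0) /
                              (total_words_in_category[category] + len(vocabulary)))
        if score > best_score:
            best_category, best_score = category, score
    return best_category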
Example #2
items = db.my_collection  # collection handle; `db` is assumed to be defined elsewhere in the source

import os
import time

try:
    import cPickle as pickle  # faster C implementation on Python 2
except ImportError:
    import pickle

# `Read`, `Filter`, `REGEX_PATTERNS` and `stemmatize` are assumed to come from the
# project's own modules; their imports are not shown in this excerpt.

# GLOBAL VARIABLES
# --------------------------------------------------------------------

# Current directory

CWD = os.path.dirname(os.path.realpath(__file__))

# Test CSV path and pre-loaded training rows (the 1.6M-tweet pos/neg training set).
test_data = CWD + "/trainingandtestdata/testdata.csv"
tr_data = Read.load_data(CWD + "/trainingandtestdata/training.1600000_posneg.csv")

# Load the English stop-word list, one word per line.
stop_words = set()

with open(CWD + '/english.stop', 'rb') as stop_w:
    for word in stop_w:
        stop_words.add(word.decode('utf-8').rstrip())

tweet_filter = Filter(
    ngram_combo=[1, 2, 3],     # use unigrams, bigrams and trigrams
    stop_words=stop_words,
    patterns=REGEX_PATTERNS,   # regex patterns for tweet clean-up, defined elsewhere in the project
    func=stemmatize            # normalization/stemming function applied to each token
)

START_TIME = time.time()
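
With these globals in place, a run could build per-category word counts from tr_data and then evaluate with the test() routine from Example #1. The sketch below is illustrative only: the train() helper, the assumption that training rows share the test layout (label in column 0, text in column 5), and the category labels ['0', '4'] are not taken from the source.

from collections import defaultdict, Counter

def train(training_rows, filter_):
    # Build per-category token counts and totals in the shapes test()/naive_bayes expect.
    word_freq = defaultdict(Counter)
    totals = Counter()
    for row in training_rows:
        label, text = row[0], row[5]
        tokens = filter_(text)
        word_freq[label].update(tokens)
        totals[label] += len(tokens)
    return word_freq, totals

word_freq, word_totals = train(tr_data, tweet_filter)
test(test_data, naive_bayes, word_freq, word_totals, ['0', '4'], tweet_filter)
print('Elapsed: {0:.1f}s'.format(time.time() - START_TIME))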