Example #1
import pickle

from gensim import corpora, models

import conf    # project-local configuration (paths, num_topics)
import corpus  # project-local corpus loader


def train_lda():
    # Weight the raw bag-of-words corpus with the saved TF-IDF model.
    tfidf = models.TfidfModel.load(conf.tfidf)
    corpus_tfidf = tfidf[corpus.load_corpus()]
    # Train LDA on the TF-IDF-weighted corpus.
    lda = models.LdaModel(corpus_tfidf,
                          id2word=corpora.Dictionary.load(conf.dictionary),
                          num_topics=conf.num_topics)
    # Pickle each document's topic distribution and write them all to disk.
    corpus_topics = lda[corpus_tfidf]
    with open(conf.corpus_topics, 'wb') as f:
        for obj in corpus_topics:
            f.write(pickle.dumps(obj))
    lda.save(conf.lda)
    return lda
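A minimal follow-up sketch (not part of the original example): reading the saved artifacts back with the same conf paths. pickle.load() consumes one object per call, so the concatenated dumps written above can be read in a loop until EOF.

lda = models.LdaModel.load(conf.lda)  # reload the trained LDA model
corpus_topics = []
with open(conf.corpus_topics, 'rb') as f:
    while True:
        try:
            corpus_topics.append(pickle.load(f))
        except EOFError:
            break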
Example #3
import sys

import analytics
import corpus
from features import extract_feature, set_encoder, encode_features
from sklearn.tree import DecisionTreeClassifier

# Human-readable names for POS tags.
a = {
    "NN": "Common Noun",
    "NNP": "Proper Noun",
}

try:
    TEST = int(sys.argv[1])
except ValueError:
    print("Enter 0, 1, or 2 to specify the testing document")
    sys.exit()
except IndexError:
    TEST = 2

train = corpus.load_corpus(all=True)
statistic = analytics.load_analytics(train)

# For each word, remember the tag with the highest observed probability.
highest_probability = {}
for i in statistic:
    highest_probability[i] = max(statistic[i].items(), key=lambda x: x[1])[0]

X_train_raw, Y_train_raw = extract_feature(data=train)

# Global label encoder / one-hot encoder used to encode the feature values.
global_label_encoder, global_hot_encoder = set_encoder(Y_train_raw)
print("Training global classifier ...")
X_train, Y_train = encode_features(X_train_raw, Y_train_raw,
                                   global_label_encoder, global_hot_encoder)
global_clf = DecisionTreeClassifier()
global_clf.fit(X_train, Y_train)
print("Completed")
Example #4
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from yellowbrick.text import TSNEVisualizer


def tsne(docs, target, outpath, **kwargs):
    # Create a new figure and axes
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Project the vectorized documents into two dimensions with t-SNE
    visualizer = TSNEVisualizer(ax=ax, **kwargs)
    visualizer.fit(docs, target)
    visualizer.poof(outpath=outpath)


if __name__ == '__main__':

    # Load and vectorize the corpus (load_corpus is a project-local helper)
    corpus = load_corpus("../../../examples/data/hobbies")
    tfidf = TfidfVectorizer()

    docs = tfidf.fit_transform(corpus.data)
    target = corpus.target

    # Whole corpus visualization
    tsne(docs, target, "images/tsne_all_docs.png")

    # No labels
    tsne(docs, None, "images/tsne_no_labels.png", labels=["documents"])

    # Apply clustering instead of class names.
    clusters = KMeans(n_clusters=5)
    clusters.fit(docs)
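The original snippet ends right after fitting KMeans; a hedged completion, passing the learned cluster assignments to the same plotting helper (the output filename is an assumption):

    # Visualize with cluster labels instead of class names (hypothetical filename)
    tsne(docs, clusters.labels_, "images/tsne_kmeans.png")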
Example #5
from sklearn.model_selection import KFold, StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

from six.moves import xrange, input  # pylint: disable=redefined-builtin
from six import text_type
from os.path import join

from utils import plot_confusion_matrix, label_classification_report, print_cm
from corpus import load_corpus
from feats import sent2features, sent2labels, sent2tokens, pos_feats, pos_word_feats, crf_feats
import pickle
import nltk

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

label_set, train_set, test_set = load_corpus()

MODEL_ROOT = './models/'

print "\nExtracting word features ..."

# Uncomment the lines below to use these features; they perform worse than the ones used in the demo.
# WORD FEATS
#train_featuresets = [(word_feats(sample[0]), sample[1]) for sample in train_set]
#test_featuresets = [(word_feats(sample[0]), sample[1]) for sample in test_set]

# POS FEATS
#train_featuresets = [(pos_feats(sample[0]), sample[1]) for sample in train_set]
#test_featuresets = [(pos_feats(sample[0]), sample[1]) for sample in test_set]

# WORD+POS FEATS
#train_featuresets = [(pos_word_feats(sample[0]), sample[1]) for sample in train_set]
#test_featuresets = [(pos_word_feats(sample[0]), sample[1]) for sample in test_set]
Example #6
from features import extract_feature, set_encoder, encode_features
from corpus import load_corpus

from sklearn.tree import DecisionTreeClassifier

# Extract raw features from the training corpus and fit the encoders on them.
X_train_raw, Y_train_raw = extract_feature(data=load_corpus())

label_encoder, hot_encoder = set_encoder(Y_train_raw)

X_train, Y_train = encode_features(X_train_raw, Y_train_raw, label_encoder, hot_encoder)

clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)

# Evaluate on the held-out split using the encoders fitted on the training data.
X_test_raw, Y_test_raw = extract_feature(load_corpus(last=True))
X_test, Y_test = encode_features(X_test_raw, Y_test_raw, label_encoder, hot_encoder)
print(clf.score(X_test, Y_test))
Example #7
# A simple probabilistic tagger.
# The goal is to beat this accuracy using ML.
import analytics
import corpus

statistic = analytics.load_analytics()

# For each word, remember the tag with the highest observed probability.
highest_probability = {}
for i in statistic:
    highest_probability[i] = max(statistic[i].items(), key=lambda x: x[1])[0]

# Flatten the test corpus into a list of (word, tag) pairs.
test = corpus.load_corpus(last=True)
test_dict = []
for i in test:
    for j in i:
        test_dict.append(j)

hit = 0
miss = 0
unknown = 0
ambiguity_miss = 0
unknown_ambiguity = 0
a = 1
for i in test_dict:
    try:
        if highest_probability[i[0]] == i[1]:
            hit += 1
        else:
            if i[1] in statistic[i[0]]:
                ambiguity_miss += 1
                miss += 1
            else:
                if len(statistic[i[0]].keys()) == 1:
                    a += 1
                    print("ambiguity:", statistic[i[0]], i)
    except KeyError:
        # The word was never seen in the training data.
        unknown += 1
Example #8
def init_tfidf():
    # Fit a TF-IDF model on the bag-of-words corpus and persist it.
    tfidf = models.TfidfModel(corpus.load_corpus())
    tfidf.save(conf.tfidf)
    return tfidf
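A minimal usage sketch, assuming the same conf and corpus modules as in Example #1: reload the saved model and weight a bag-of-words corpus with it, exactly as train_lda() does.

tfidf = models.TfidfModel.load(conf.tfidf)
corpus_tfidf = tfidf[corpus.load_corpus()]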
Example #9
# -*- coding: utf-8 -*-

import os
import random
import math
from numpy import random as nprand
import copy as cp
import corpus as cputil

corpus = cputil.load_corpus("corpus-formatted.csv")

# Keep tokens longer than one character that are neither digits nor cased
# letters (w.lower().islower() is True only when w contains a cased letter),
# then keep documents with more than 43 such tokens.
cc = [([w for w in doc if len(w) > 1 and not w.isdigit() and not w.lower().islower()], ul)
      for doc, ul in corpus]
corpus = [d for d in cc if len(d[0]) > 43]


class TopicOverTime:
    def __init__(self, corpus, alpha=0.1, beta=0.01, gamma=0.01, C=20, n_iter=300):
        self.corpus = corpus
        # Per-document lists of users and timestamps from the (timestamp, user) pairs.
        self.corpus_user = [[u for t, u in ul] for _, ul in self.corpus]
        self.corpus_timestamp = [[t for t, u in ul] for _, ul in self.corpus]
        self.M = len(self.corpus)                   # number of documents
        self.NU = sum(map(len, self.corpus_user))   # total user mentions

        self.udic = list(set([u for d in self.corpus_user for u in d]))
        self.usize = len(self.udic)
        self.C = C
        self.alpha = alpha
        self.gamma = gamma
        self.n_iter = n_iter
        # Random initial community assignment for every user mention.
        self.communities = [nprand.randint(0, self.C, size=l) for l in map(len, self.corpus_user)]
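A hypothetical usage sketch: instantiate the model on the filtered corpus with the defaults defined above (the snippet is truncated before n_iter is consumed, so it is only passed through here).

model = TopicOverTime(corpus, alpha=0.1, gamma=0.01, C=20, n_iter=300)
print(model.M, "documents,", model.usize, "distinct users")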
Example #10
    def get_word_list(self):  # method name inferred; the snippet begins mid-class
        return self.word_list

    def set_clf(self, clf):
        self.clf = clf

    def get_clf(self):
        return self.clf

    def __str__(self):
        return "".join(self.word_list)



from sklearn.tree import DecisionTreeClassifier

train = corpus.load_corpus(test=TEST)
statistic = analytics.load_analytics(train)

# For each word, remember the tag with the highest observed probability.
highest_probability = {}
for i in statistic:
    highest_probability[i] = max(statistic[i].items(), key=lambda x: x[1])[0]

X_train_raw, Y_train_raw = extract_feature(data=train)

# Global label encoder / one-hot encoder used to encode the feature values.
global_label_encoder, global_hot_encoder = set_encoder(Y_train_raw)
print("Training global classifier ...")
X_train, Y_train = encode_features(X_train_raw, Y_train_raw,
                                   global_label_encoder, global_hot_encoder)
global_clf = DecisionTreeClassifier()
global_clf.fit(X_train, Y_train)
Example #13
# -*- coding: utf-8 -*-

import os
import random
import math
from numpy import random as nprand
import copy as cp
import corpus as cputil

corpus = cputil.load_corpus("corpus_filtered.final")
# Wrap in list(...) so the filtered corpus can be indexed and len()-ed
# below (filter() returns a lazy iterator on Python 3).
corpus = list(filter(lambda c: len(c[0]) > 48, corpus))


class UserLDA:
    def __init__(self, corpus, alpha=0.1, beta=0.01, gamma=0.01, K=20, n_iter=300):
        self.corpus = corpus
        # Parallel per-document lists of words and users.
        self.corpus_word = [w for w, _ in self.corpus]
        self.corpus_user = [u for _, u in self.corpus]
        self.M = len(self.corpus)                   # number of documents
        self.NW = sum(map(len, self.corpus_word))   # total word tokens
        self.NU = sum(map(len, self.corpus_user))   # total user mentions

        # Word and user vocabularies.
        self.wdic = list(set([w for d in self.corpus_word for w in d]))
        self.wsize = len(self.wdic)
        self.udic = list(set([u for d in self.corpus_user for u in d]))
        self.usize = len(self.udic)
        self.K = K
        self.C = K
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
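A hypothetical usage sketch; the constructor's tail is truncated above, but the fields set so far can be inspected after instantiation.

model = UserLDA(corpus, K=20)
print(model.M, "documents,", model.wsize, "word types,", model.usize, "users")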