def savefeatures(classifier, filename):
    """Collect per-feature statistics from a classifier and save them as JSON.

    Accepts a classifier (its ``_feature_probdist`` mapping is read directly
    and ``prob_classify`` is called on it) and a filename string. The
    collected rows are written to ``filename`` with ``jsondata.save``.

    Returns the list of 6-tuples that was saved. Per tuple:
      1. name of the feature
      2. predictive ratio (as computed by ``maxratio``)
      3. probability of the feature showing up given the "pos" label
      4. probability of the feature showing up given the "neg" label
      5. the label with maximum P(label|feature)
      6. P(label|feature) for that label
    """
    probdist = classifier._feature_probdist
    rows = []
    for (label, fname) in probdist:
        # Keys are (label, feature name) pairs; only the "neg" entries are
        # processed -- presumably so each feature is emitted once rather
        # than once per label.
        if label != "neg":
            continue

        predictive_ratio = maxratio(probdist, fname)
        given_pos = p_of_feature_given_label(probdist, "pos", fname)
        given_neg = p_of_feature_given_label(probdist, "neg", fname)

        # Classify a document containing only this feature to find the
        # most likely label and its probability.
        label_dist = classifier.prob_classify({fname: True})
        best_label = label_dist.max()
        best_prob = label_dist.prob(best_label)

        row = (fname, predictive_ratio, given_pos, given_neg,
               best_label, best_prob)
        rows.append(row)

    jsondata.save(filename, rows)
    return rows
def tlc_print_func(i, var):
    """Print the current tlc parameters and periodically save them.

    Prints ``var.eta`` and ``var.sigma_squared``. Whenever ``i`` is a
    multiple of 5, ``var.to_dict()`` is saved with ``jsondata.save`` to
    ``'mytlc-output-<i>.dat'``.

    Args:
        i: counter value (presumably the iteration number -- confirm
            against the caller).
        var: object exposing ``eta``, ``sigma_squared`` and ``to_dict()``.

    Returns:
        None.
    """
    # Single-argument parenthesised print produces the same output under
    # both Python 2 (print statement) and Python 3 (print function).
    # Printing var.y is disabled.
    print('eta: %s' % var.eta)
    print('ss: %s' % var.sigma_squared)
    if i % 5 == 0:
        jsondata.save('mytlc-output-%s.dat' % i, var.to_dict())
def slda_print_func(i, var):
    """Print the current slda parameters and periodically save them.

    Prints ``var.gamma``, ``var.eta`` and ``var.sigma_squared``. Whenever
    ``i`` is a multiple of 5, ``var.to_dict()`` is saved with
    ``jsondata.save`` to ``'slda-output-<i>.dat'``.

    Args:
        i: counter value (presumably the iteration number -- confirm
            against the caller).
        var: object exposing ``gamma``, ``eta``, ``sigma_squared`` and
            ``to_dict()``.

    Returns:
        None.
    """
    # Single-argument parenthesised print produces the same output under
    # both Python 2 (print statement) and Python 3 (print function).
    # Printing var.y is disabled.
    print('gamma: %s' % var.gamma)
    print('eta: %s' % var.eta)
    print('ss: %s' % var.sigma_squared)
    if i % 5 == 0:
        jsondata.save('slda-output-%s.dat' % i, var.to_dict())
#!/usr/bin/env python
"""
Converts Liu opinion sentiment word lists into JSON format for visualizing them.

Copyright (C) 2011 Joseph Perla

GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import codecs

import jsondata

filename = 'data/liu_neg_words.txt'
# Split off the extension so the output can reuse the same base name.
f, ext = filename.rsplit('.', 1)

# Read through a context manager so the file handle is always closed.
with codecs.open(filename, 'r', 'utf8') as infile:
    lines = infile.readlines()

# Skip lines starting with ';', trim line endings and spaces, then drop
# whatever ended up empty.
real = [line.strip('\r\n ') for line in lines if not line.startswith(';')]
real = [word for word in real if word]

try:
    jsondata.save(f + '.json', real)
except Exception:
    # Debugging aid: open the debugger at the point of failure.
    # KeyboardInterrupt / SystemExit are deliberately not trapped.
    import pdb
    pdb.post_mortem()
# topic 015 in ls models/lda_c_2011_10_16/final.* is about restaurants and dining
# I classified the 10k document collection and put those gammas in ccode/lda-c/classify10000-gamma.dat
# this will find the documents which have that topic
import jsondata

# Index of the restaurant/dining topic and the minimum gamma a document
# must have for that topic to be selected.
RESTAURANT_TOPIC = 15
GAMMA_THRESHOLD = 100.0

# An earlier value, 'models/lda_c_2011_10_16/final.gamma', was immediately
# overwritten by the path below, so only this file was ever read.
gamma_filename = 'ccode/lda-c/classify10000-gamma.dat'

# One row of whitespace-separated floats per document. split() (rather than
# split(' ')) tolerates repeated or trailing spaces and the line ending.
with open(gamma_filename, 'r') as gamma_file:
    gammas = [[float(value) for value in row.split()]
              for row in gamma_file.readlines()]

restaurant_gammas = [(i, g) for i, g in enumerate(gammas)
                     if g[RESTAURANT_TOPIC] > GAMMA_THRESHOLD]
restaurant_indices = [i for (i, g) in restaurant_gammas]

data_filename = 'data/lda/nytimes_10000_sparse_lda_2011_10_16.dat'
with open(data_filename, 'r') as data_file:
    docs = data_file.readlines()

# Rows of the gamma file and lines of the data file are matched by position.
restaurant_docs = [docs[i] for i in restaurant_indices]


def _parse_sparse_doc(line):
    """Parse one sparse document line into a list of (int, int) pairs.

    The first whitespace-separated token is skipped (presumably the
    per-document term count of the lda-c format -- confirm); every
    remaining token is read as 'a:b' and converted to ``(int(a), int(b))``.
    """
    pairs = []
    for entry in line.split()[1:]:
        parts = entry.split(':')
        pairs.append((int(parts[0]), int(parts[1])))
    return pairs


sparse_docs = [_parse_sparse_doc(d) for d in restaurant_docs]

# NOTE(review): interactive breakpoint kept from the original; the script
# stops here until the debugger is continued. Remove for unattended runs.
import pdb
pdb.set_trace()

jsondata.save('background.nyt_med.json', sparse_docs)
generate_complete_overlap, ] num_points = 10000 table = [] for pp in pps: for d in dists: pos, neg = d(num_points, pp) for c in cs: pos_sample, unlabeled = sample_positive(c, pos, neg) # validation set: v_p, v_u = sample_positive(c, *d(num_points, pp)) #v_p, v_u = d(num_points, pp) data = (pos_sample, unlabeled, v_p, v_u) #data, fixers = normalize_pu_data(*data) _, estimators = calculate_estimators(*data, max_iter=100) t = (pp, d.func_name, c,) + estimators print t table.append(t) #e1, e2, e3, e1_hat, e4_hat = estimators # save the table for graphing import jsondata jsondata.save('table.json', table)
num_points = 10000

# Build one row per (pp, distribution generator, c) combination.
table = []
for pp in pps:
    for d in dists:
        pos, neg = d(num_points, pp)
        for c in cs:
            pos_sample, unlabeled = sample_positive(c, pos, neg)

            # validation set: drawn from a second, independent call to d
            v_p, v_u = sample_positive(c, *d(num_points, pp))

            data = (pos_sample, unlabeled, v_p, v_u)
            _, estimators = calculate_estimators(*data, max_iter=100)

            # estimators must be a tuple; it is appended to the row key.
            # (An earlier commented-out line unpacked it as
            # e1, e2, e3, e1_hat, e4_hat.)
            # __name__ is used instead of the Python 2-only func_name.
            t = (
                pp,
                d.__name__,
                c,
            ) + estimators
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print(t)
            table.append(t)

# save the table for graphing
import jsondata
jsondata.save('table.json', table)
def lda_print_func(i, var):
    """Print the current lda gamma and periodically save the parameters.

    Prints ``var.gamma``. Whenever ``i`` is a multiple of 5,
    ``var.to_dict()`` is saved with ``jsondata.save`` to
    ``'lda-output-<i>.dat'``.

    Args:
        i: counter value (presumably the iteration number -- confirm
            against the caller); it is formatted with ``%i`` in the
            output filename.
        var: object exposing ``gamma`` and ``to_dict()``.

    Returns:
        None.
    """
    # Single-argument parenthesised print produces the same output under
    # both Python 2 (print statement) and Python 3 (print function).
    # Printing var.phi is disabled.
    print('gamma: %s' % var.gamma)
    if i % 5 == 0:
        jsondata.save('lda-output-%i.dat' % i, var.to_dict())