#!/usr/bin/env python
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc

import jsondata

if __name__ == '__main__':
    rc('text', usetex=True)
    data = jsondata.read('table.json')

    pps = [0.1, 0.5, 0.9]
    dd = [('generate_well_separable', 'Well Separated'),
          ('generate_mostly_separable', 'Mostly Separated'),
          ('generate_some_overlap', 'Some Overlap'),
          ('generate_complete_overlap', 'Complete Overlap'),
         ]

    for distributions, name in dd:
        fig, axs = plt.subplots(3, sharex=True, sharey=True)  # Three subplots sharing both x/y axes
        for i, pp in enumerate(pps):
            rows = [r[2:] for r in data if r[0] == pp and r[1] == distributions]
            d = np.array(rows).T
            d.sort(axis=1)
            assert d.shape == (6, 4)
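# jsondata is imported by every script in this section, but its source is not
# shown here. The sketch below is an assumption inferred from usage (each
# .json file behaves like a list of JSON values); the real module evidently
# also handles .npz/.gz files, so this is only an approximation.
import json

def read(filename):
    """Hypothetical jsondata.read(): loads a list of values from a JSON file."""
    with open(filename) as f:
        return json.load(f)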
#!/usr/bin/env python
import numpy as np
from itertools import izip

import jsondata
import tlc

if __name__ == '__main__':
    num_labeled = 9994
    cut_short = 100000000000  # effectively no cap; lower it to subsample quickly

    # use my tlc synthetically generated dataset
    documents = jsondata.read('data/documents.dc.nyt.json')[:cut_short]
    comments = jsondata.read('data/comments.dc.nyt.json')[:cut_short]
    labeled_documents = jsondata.read('data/yelp.nyt_med.json')[:num_labeled][:cut_short]
    background = jsondata.read('data/background.nyt_med.json')[:cut_short]
    y = jsondata.read('data/yelp.labels.json')[:num_labeled][:cut_short]
    y = [(i - 3.0) for i in y]  # center the star ratings around 0

    real_data = (documents, comments, labeled_documents, background, y)

    var = tlc.TLCVars(real_data, Ku=25, Ks=5, Kb=25)
    # one regression weight per sentiment topic (Ks=5), spanning the rating scale
    var.eta = np.array([3.0, 1.5, 0.5, -1.5, -3.0])

    try:
        output = tlc.run_tlc(var)
    except Exception, e:
        print e
#!/usr/bin/env python
import jsondata

num_docs = 10000
labeled_documents = jsondata.read('data/yelp.nyt_med.json')[:num_docs]

# print the indices of labeled documents that contain no words at all
for i, l in enumerate(labeled_documents):
    if len(l) == 0:
        print i
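# Empty documents are filtered out elsewhere (see the commented-out filter in
# the med-slda script below). A sketch of dropping them together with their
# labels; the labels path is the one used by the training scripts:
y = jsondata.read('data/yelp.labels.json')[:num_docs]
pairs = [(l, label) for l, label in zip(labeled_documents, y) if len(l) > 0]
labeled_documents = [l for l, label in pairs]
y = [label for l, label in pairs]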
    second row matches to column and contains count of term in doc.
    """
    def matrix(d):
        """Accepts dictionary as above. Returns a 2-row matrix."""
        # todo: i think this uses way too much memory
        # flatten the (word, count) pairs; R fills matrices column-major,
        # so each column of the result is one (word id, count) pair
        elements = list(itertools.chain(*d.iteritems()))
        return r.matrix(ro.IntVector(elements), nrow=2)
    matrices = [matrix(d) for d in features]
    return matrices
    # return r.list(matrices)

if __name__ == "__main__":
    lexicon = dict([(a, i) for i, a in enumerate(jsondata.read("data/nytimes_med_common_vocab.json"))])

    # the mongo block below appears to be disabled by a string literal
    """
    db = None
    try:
        import pymongo
        db = pymongo.Connection('localhost', 27017).nytimes
    except:
        print 'did not connect to mongo; not running'

    docs_with_comments = list(db.article.find({'num_comments': {'$gt': 0}}).sort([('pubdate', -1)]))
    dwc = docs_with_comments

    titles = []
    docs = []
def read_yelp_reviews():
    """Returns a generator of review dicts from the Yelp academic dataset."""
    for d in jsondata.read("data/yelp_academic_dataset.json"):
        if d["type"] == "review":
            yield d
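# Example use of the generator above; the 'stars' and 'text' fields are an
# assumption about the Yelp records (only 'type' is confirmed by the code):
for n, review in enumerate(read_yelp_reviews()):
    print review.get('stars'), len(review.get('text', ''))
    if n >= 4:
        break  # just peek at the first few reviews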
"""
Copyright (C) 2011 Joseph Perla

GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import glob

import numpy as np

import jsondata
import ppc

if __name__ == '__main__':
    s = 'midterm/mytlc-output-15-%s'
    eta = jsondata.read(glob.glob(s % 'eta*')[0])
    sigma_squared = jsondata.read(glob.glob(s % 'sigma_squared*')[0])[0]
    beta = jsondata.read(glob.glob(s % 'beta*')[0])
    phi = jsondata.read(glob.glob(s % 'phiC*')[0])

    # cut beta down to what we need
    Nd, Kc = phi[0].shape
    beta = beta[:Kc, :]
    print 'finished reading in params...'

    global_params = {'eta': eta, 'beta': beta, 'sigma_squared': sigma_squared}
    local_params = [{'phi': p} for p in phi]

    comments = jsondata.read('data/comments.dc.nyt.json')
    print 'finished reading in docs...'

    p = ppc.YelpSentimentTLCPPC()
"""Looks inside json data.

Prints out the first few lines of words.
Useful for making sure I have the data I want.

Copyright (C) 2011 Joseph Perla

GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import sys

import jsondata

if __name__ == '__main__':
    data_filename = sys.argv[1]
    vocab_filename = sys.argv[2]
    num_docs = int(sys.argv[3])
    words_per_doc = int(sys.argv[4])
    associated_filename = sys.argv[5] if len(sys.argv) > 5 else None

    data = jsondata.read(data_filename)[:num_docs]
    lexicon = jsondata.read(vocab_filename)

    # the top words_per_doc words of each document, most frequent first
    words = [[lexicon[w] for (w, c) in sorted(doc, key=lambda wc: -wc[1])][:words_per_doc]
                for doc in data]

    if associated_filename is not None:
        associated = jsondata.read(associated_filename)[:num_docs]

    for i in xrange(num_docs):
        if associated_filename is not None:
            print associated[i], words[i]
        else:
            print words[i]
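# Example invocation (the script name is an illustrative assumption; the
# argument order comes from the code above, and the paths appear elsewhere
# in this repo):
#   python inspect_json_data.py data/documents.dc.nyt.json \
#       data/nytimes_med_common_vocab.json 10 15 data/titles.dc.nyt.json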
#!/usr/bin/env python
import csv

import jsondata
from inspect_slda_model import predict

phi_filename = '../balancedtlc/mytlc-output-20-phiC.dat.npy.list.npz'
vocab_filename = ''
eta_filename = '../balancedtlc/mytlc-output-20-eta.dat.npy.gz'
titles_filename = '../data/titles.dc.nyt.json'

phi = jsondata.read(phi_filename)
eta = jsondata.read(eta_filename)
titles = jsondata.read(titles_filename)
print 'read in data...'

predicted_ratings = list(sorted((i, predict(eta, p)) for i, p in enumerate(phi)))
print 'predicted ratings...'

reader = csv.reader(open('gold.csv', 'r'))
#v = [i[0] for i in predicted_ratings]
#import pdb; pdb.set_trace()
rall = []
for line in reader:
    index = int(line[1])
    mean = float(line[2])
"""Code I used to generate histograms of some of the data,
for visualizing it.

Copyright (C) 2011 Joseph Perla

GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import pylab

import vlex
import jsondata

# the last assignment wins; the earlier filenames are kept for easy switching
f = 'data/yelp_4cat_naive_full_mytoken_74.json'
f = 'data/yelp_4cat_naive_full_standardtoken.json'
f = 'data/yelp_2cat_naive_full_mytoken_783.json'

data = list(jsondata.read(f))
words = vlex.parse_bayes_into_scores(data)
values = [w[1] for w in words]

#remove the modes, +/-.75
#values = [v for v in values if abs(v) != .75]
values = [v for v in values if 30 > abs(v) and abs(v) != 3]

pylab.hist(values, bins=50)
pylab.show()
"""
GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import glob

import numpy as np

import jsondata
import ppc

if __name__ == '__main__':
    s = 'midterm/mytlc-output-20-%s'
    name = 'tlc-pslda'

    eta = jsondata.read(glob.glob(s % 'eta*')[0])
    sigma_squared = jsondata.read(glob.glob(s % 'sigma_squared*')[0])[0]
    beta = jsondata.read(glob.glob(s % 'beta*')[0])
    phi = jsondata.read(glob.glob(s % 'phiC*')[0])

    # cut beta down to what we need
    Nd, Kc = phi[0].shape
    beta = beta[:Kc, :]

    # in comments, the last Ks columns of phi are the sentiment topics
    Ks = len(eta)
    phi = [p[:, -Ks:] for p in phi]
    print 'finished reading in params...'

    global_params = {'eta': eta, 'beta': beta, 'sigma_squared': sigma_squared}
    local_params = [{'phi': p} for p in phi]
def predict(eta, phi):
    """Returns the sLDA-style prediction eta . E[zbar] for one document."""
    Ks = len(eta)
    N, K = phi.shape
    # only the last Ks (sentiment) topics enter the regression
    phi = phi[:, -Ks:]
    EZ = np.sum(phi, axis=0) / N
    return np.dot(eta, EZ)

if __name__ == '__main__':
    phi_filename = sys.argv[1]
    vocab_filename = sys.argv[2]
    num_docs = int(sys.argv[3])
    # the last two arguments are optional, matching the None checks below
    eta_filename = sys.argv[4] if len(sys.argv) > 4 else None
    associated_filename = sys.argv[5] if len(sys.argv) > 5 else None

    phi = jsondata.read(phi_filename)
    lexicon = jsondata.read(vocab_filename)

    eta = None
    if eta_filename is not None:
        eta = jsondata.read(eta_filename)
        print 'eta: %s' % eta

    if associated_filename is not None:
        associated = jsondata.read(associated_filename)
    print 'read in data...'

    predicted_ratings = list(sorted((predict(eta, p), i) for i, p in enumerate(phi)))
    print 'predicted ratings...'
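# A toy sanity check of the regression step above, E[y] = eta . E[zbar];
# all numbers are made up purely for illustration.
import numpy as np
toy_eta = np.array([1.0, -1.0])            # positive topic, negative topic
toy_phi = np.array([[0.9, 0.1],
                    [0.8, 0.2],
                    [0.3, 0.7]])           # per-word topic responsibilities
toy_EZ = toy_phi.sum(axis=0) / 3.0         # empirical topic proportions E[zbar]
print np.dot(toy_eta, toy_EZ)              # 0.333...: a mildly positive doc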
        raise NotImplementedError

    def discrepancy(self, posterior, observed):
        """Accepts posterior, which is a dictionary of phi, beta, eta,
            and sigma squared.
            Observed is a sparse vector of words: a list of
            (word int, count) 2-tuples.
            Returns a real number.

            Just takes the gap between the posterior norm and the observed
            norm, standardized by sigma.
        """
        #TODO: jperla: maybe can generalize, sigma is a def standardizer() ?
        s = np.sqrt(posterior['sigma_squared'])
        return abs(self.posterior_norm(posterior) - self.observed_norm(observed)) / s

vocab = dict((w, i) for i, w in enumerate(jsondata.read('../data/nytimes_med_common_vocab.json')))
pos = jsondata.read('../data/liu_pos_words.json')
neg = jsondata.read('../data/liu_neg_words.json')

# indices of the Liu positive/negative sentiment words that are in the vocab
posi = set([vocab[w] for w in pos if w in vocab])
negi = set([vocab[w] for w in neg if w in vocab])

class YelpSentimentPartialSLDAPPC(TLCPPC):
    def simulate(self, posterior, observed):
        """Accepts posterior vars, which include phi and eta,
            as well as an observed value, which is just a real number.
            Returns a new observation.

            The observation is drawn from a normal around the expected
            mean, as in regression.
        """
        s = np.sqrt(posterior['sigma_squared'])
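# posterior_norm() and observed_norm() are not shown in this fragment. A
# hypothetical observed_norm consistent with the posi/negi index sets above
# would score a document by its net fraction of positive vs. negative words:
def observed_norm_sketch(observed):
    """observed is a list of (word int, count) 2-tuples."""
    pos_c = sum(c for w, c in observed if w in posi)
    neg_c = sum(c for w, c in observed if w in negi)
    total = sum(c for w, c in observed)
    return float(pos_c - neg_c) / max(total, 1)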
"""
Copyright (C) 2011 Joseph Perla

GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import numpy as np

import ppc
import jsondata

if __name__ == '__main__':
    s = 'medsldamodel/med-slda.final-%s.dat'
    eta = np.array(jsondata.read(s % 'eta'))
    beta = np.array(jsondata.read(s % 'beta'))
    phi = [np.array(p) for p in jsondata.read(s % 'phi')]
    sigma_squared = jsondata.read(s % 'sigma_squared')[0]
    print 'finished reading in params...'

    global_params = {'eta': eta, 'beta': beta, 'sigma_squared': sigma_squared}
    local_params = [{'phi': p} for p in phi]

    # get the data
    num_docs = 1000
    #labeled_documents = jsondata.read('data/yelp.nyt_med.json')[:num_docs]
    y = jsondata.read('data/yelp.labels.json')[:num_docs]

    # filter out documents with no words
    #all_data = [(l,y) for l,y in izip(labeled_documents,y) if len(l) > 0]
def describe_doc(data_filename, vocab_filename, docid):
    """Returns the sorted words of document docid from an LDA-C style data file."""
    # each line looks like "N id1:count1 id2:count2 ..."; drop the leading N
    counts = open(data_filename).readlines()[docid].split(' ')[1:]
    wordids = [int(c.split(':')[0]) for c in counts]
    vocab = list(jsondata.read(vocab_filename))
    words = [vocab[i] for i in wordids]
    return sorted(words)
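# Hypothetical usage (both filenames are illustrative assumptions):
print describe_doc('documents.dat', 'data/nytimes_med_common_vocab.json', 0)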
def grab_topic(beta_filename, vocab_filename, topicid):
    """Returns the (word, weight) pairs of topic topicid, heaviest first."""
    counts = [float(f) for f in open(beta_filename).readlines()[topicid].split(' ')]
    minimum = min(counts)  # to ignore very irrelevant words
    vocab = list(jsondata.read(vocab_filename))
    words = [(vocab[i], p) for i, p in enumerate(counts) if p > minimum]
    return sorted(words, key=lambda v: -v[1])
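# Hypothetical usage: print the ten heaviest words of topic 0. grab_topic
# expects a plain-text beta file with one space-separated row of weights per
# topic; 'beta.dat' and the vocab path are illustrative assumptions.
for word, p in grab_topic('beta.dat', 'data/nytimes_med_common_vocab.json', 0)[:10]:
    print word, p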