def plot_diurnal(headers):
    """
    Diurnal plot of the emails, with years running along the x axis and times
    of day on the y axis.
    """
    xday = []
    ytime = []
    print 'making diurnal plot...'
    for h in iterview(headers):
        if len(h) > 1:
            try:
                s = h[1][5:].strip()
                x = dateutil.parser.parse(s)
            except ValueError:
                print
                print marquee(' ERROR: skipping ')
                print h
                print marquee()
                continue
            timestamp = mktime(x.timetuple())   # convert datetime into a floating-point number
            mailstamp = datetime.fromtimestamp(timestamp)
            xday.append(mailstamp)              # time the email arrived
            # Note that the year, month and day are not important here.
            y = datetime(2010, 10, 14, mailstamp.hour, mailstamp.minute, mailstamp.second)
            ytime.append(y)
    plot_date(xday, ytime, '.', alpha=.7)
    xticks(rotation=30)
    return xday, ytime
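# Hypothetical usage sketch (not part of the original module): plot_diurnal is
# assumed to receive parsed headers where h[1] is the raw 'Date: ...' header
# line; the addresses and dates below are made up for illustration.
example_headers = [
    ('From: alice@example.com', 'Date: Thu, 14 Oct 2010 09:30:00 -0400'),
    ('From: bob@example.com', 'Date: Fri, 15 Oct 2010 22:05:12 -0400'),
]
xday, ytime = plot_diurnal(example_headers)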
def validate(model, iteration=None):

    def f1(data, name):
        print
        print 'Phrase-based F1:', name
        f1 = F1()
        for i, x in enumerate(iterview(data)):
            predict = extract_contiguous(model(x))
            truth = extract_contiguous(x.truth)
            # (i,begin,end) uniquely identifies the span
            for (label, begins, ends) in truth:
                f1.add_relevant(label, (i, begins, ends))
            for (label, begins, ends) in predict:
                f1.add_retrieved(label, (i, begins, ends))
        print
        return f1.scores(verbose=True)

    llh = sum(map(crf.likelihood, iterview(train, msg='llh'))) / len(train)

    with lineplot('llh') as d:
        d.append(llh)

    print
    print 'likelihood:', llh
    print
def validate(model, iteration=None):

    def f1(data, name):
        print
        print 'Phrase-based F1:', name
        f1 = F1()
        for i, x in enumerate(iterview(data)):
            predict = extract_contiguous(model(x))
            truth = extract_contiguous(x.truth)
            # (i,begin,end) uniquely identifies the span
            for (label, begins, ends) in truth:
                f1.add_relevant(label, (i, begins, ends))
            for (label, begins, ends) in predict:
                f1.add_retrieved(label, (i, begins, ends))
        print
        return f1.scores(verbose=True)

    def weight_sparsity(W, t=0.0001):
        a = (np.abs(W) > t).sum()
        b = W.size
        print '%.2f (%s/%s) sparsity' % (a * 100.0 / b, a, b)

    f1(train, name='TRAIN')
    f1(test, name='TEST')

    print
    weight_sparsity(model.W)

    print
    print 'likelihood:', sum(map(crf.likelihood, iterview(train))) / len(train)
    print
    print
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print
                print red % ('#' + '_' * len(ff))
                print red % ('#' + ff)
                print
                print ('%s: %s' % (yellow % 'meta', meta['title'])).encode('utf8')
                print ('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8')
                print
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
def validate(model, _):
    llh = sum(map(crf.likelihood, iterview(train, msg='llh'))) / len(train)

    _, _, _, _, f = zip(*f1(train, 'train', model))
    overall = 100 * np.mean(f)   # equally weighted average F1

    print()
    print(f'log-likelihood: {llh:g}')
    print(f'F1 overall: {overall:.2f}')
    print()
def predict(self, tokenlist):
    """
    Takes a list of Tokens and returns the prediction on the data.
    """
    for i, x in enumerate(iterview([tokenlist])):
        if x:
            predict = extract_contiguous(self.model(x))
        else:
            return None
    return predict
def pdfs(pattern):
    "Import pdfs whose filenames match `pattern`."
    for source in iterview(glob(pattern)):
        if ' ' in source:
            print '[WARN] No spaces allowed in document source... renaming'
            newsource = source.replace(' ', '_')
            os.rename(source, newsource)
            source = newsource
        add.document(source=source, tags=[], interactive=False)
def delicious(xml):
    "Import links from delicious xml export. E.g. the output of delicious_import.py"
    with open(xml) as f:
        soup = BeautifulSoup(f)
    for post in iterview(soup.findAll('post')):
        print()
        add.document(source=post['href'],
                     tags=post['tag'],
                     title=post['description'],
                     notes=post['extended'],
                     interactive=False)
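# The delicious export format assumed above looks roughly like the following
# (illustrative only; only the attributes the code reads are shown):
#
#   <posts>
#     <post href="http://example.com/article"
#           description="Article title"
#           extended="free-form notes"
#           tag="tag1 tag2" />
#   </posts>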
def pdfs(pattern):
    "Import pdfs whose filenames match `pattern`."
    for source in iterview(glob(pattern)):
        if ' ' in source:
            print('[WARN] No spaces allowed in document source... renaming')
            newsource = source.replace(' ', '_')
            os.rename(source, newsource)
            source = newsource
        add.document(source=source, tags=[], interactive=False)
def f1(data, name):
    print
    print 'Phrase-based F1:', name
    f1 = F1()
    for i, x in enumerate(iterview(data)):
        predict = extract_contiguous(model(x))
        truth = extract_contiguous(x.truth)
        # (i,begin,end) uniquely identifies the span
        for (label, begins, ends) in truth:
            f1.add_relevant(label, (i, begins, ends))
        for (label, begins, ends) in predict:
            f1.add_retrieved(label, (i, begins, ends))
    print
    return f1.scores(verbose=True)
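# The F1 helpers above assume a routine that collapses a per-token label
# sequence into (label, begin, end) spans. A minimal sketch of that assumption
# (the real extract_contiguous may differ, e.g. in how it treats an 'O' label):
def extract_contiguous_sketch(labels):
    """Collapse a per-token label sequence into (label, begin, end) spans,
    where `end` is exclusive."""
    spans = []
    begin = 0
    for i in range(1, len(labels) + 1):
        if i == len(labels) or labels[i] != labels[begin]:
            spans.append((labels[begin], begin, i))
            begin = i
    return spans

# e.g. extract_contiguous_sketch(['B','B','O','A']) == [('B',0,2), ('O',2,3), ('A',3,4)]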
def _test_binarize_unbinarize(trees):
    from arsenal.iterextras import iterview
    for t in iterview(list(trees)):
        b = binarize(t)

        # check that tree is indeed binary
        assert check_binary_tree(b), b

        # check roundtrip 'original tree' -> 'binary tree' -> 'original tree'
        u = unbinarize(b)
        assert u == t

        # unbinarize on an unbinarized tree should do nothing (other than copy
        # the tree)
        assert unbinarize(t) == t
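# Context: binarize is assumed to convert an n-ary tree into an equivalent
# binary tree (e.g. by introducing intermediate nodes), and unbinarize to
# invert that transformation exactly; the assertions above check both
# directions of the roundtrip.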
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print
                print red % ('#' + '_' * len(ff))
                print red % ('#' + ff)
                print
                print ('%s: %s' % (yellow % 'meta', meta['title'])).encode('utf8')
                print ('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8')
                print
            yield (meta, d, pdfminer(filename))
def run_test():
    [train, _] = partition(get_data('data/tagged_references.txt'), [0.01, 0.0])

    (L, A) = build_domain(train)
    crf = StringCRF(L, A)

    print('Testing gradient of log-likelihood....')
    crf.preprocess(train)
    crf.W[:] = np.random.uniform(-1, 1, size=crf.W.shape)
    test_gradient(crf, train)

    # Check that we have enough features to overfit this small training set.
    crf.sgd(train, iterations=10)

    llh = sum(map(crf.likelihood, iterview(train, msg='llh'))) / len(train)
    print(f'log-likelihood {llh:g}')

    _, _, _, _, f = zip(*f1(train, 'train', crf))
    overall = 100 * np.mean(f)   # equally weighted average F1
    print(f'Overall F1 (train): {overall:.2f}')
def learn(data, test):
    labels = {x.label for x in data}
    w = {y: defaultdict(float) for y in labels}
    for t in iterview(range(10), every=1):
        # print
        # print
        # print 'Iteration', t
        alpha = 10.0 / (t + 1)**0.8
        for x in data:
            y = predict(w, x.features)
            if x.label != y:
                for k in x.features:
                    w[x.label][k] += alpha
                    w[y][k] -= alpha
        # f1('train', data, w)
        # f1('test', test, w)
    return w
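# `predict` is assumed to score each label by summing the weights of the
# active features and return the argmax. A minimal sketch of that assumption
# (hypothetical; the real predict may break ties differently):
def predict_sketch(w, features):
    return max(w, key=lambda y: sum(w[y][k] for k in features))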
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print()
                print(colors.red % ('#' + '_' * len(ff)))
                print(colors.red % ('#' + ff))
                print()
                print(('%s: %s' % (colors.yellow % 'meta', meta['title'])).encode('utf8'))
                print(('%s: %s' % (colors.yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8'))
                print()
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
def feature_label_freq_filter(data, c, threshold=5):
    for x in iterview(data, every=int(len(data) * .1)):
        y = x.label
        x.features = [k for k in x.features if c[y, k] >= threshold]
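# `c` is assumed to be a count over (label, feature) pairs. A sketch of how it
# might be built before calling feature_label_freq_filter (illustration only):
from collections import Counter

def count_label_features(data):
    c = Counter()
    for x in data:
        for k in x.features:
            c[x.label, k] += 1
    return c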
def main():
    datafile = sys.argv[1]

    train, test = traintest(datafile)

    print('train: %s, test: %s' % (len(train), len(test)))

    from scipy.sparse import dok_matrix
    from sklearn import linear_model
    from sklearn.svm import SVC
    from arsenal.alphabet import Alphabet

    N_FEATURES = 100000

    alphabet = Alphabet(random_int=N_FEATURES)

    def _f1(name, data, c, verbose=True):
        if verbose:
            print()
            print(name)
        f = F1()
        for (i, x) in enumerate(data):
            phi = dok_matrix((1, N_FEATURES))
            for k in x.features:
                phi[0, alphabet[k] % N_FEATURES] = 1.0
            [y] = c.predict(phi)
            f.report(i, y, x.label)
        f.scores(verbose=verbose)
        return f

    X = dok_matrix((len(train), N_FEATURES))

    M = len(train)
    Y = []
    X = dok_matrix((M, N_FEATURES))
    for i, x in enumerate(train):
        # binary features
        for k in x.features:
            X[i, alphabet[k] % N_FEATURES] = 1.0
        Y.append(x.label)
    X = X.tocsc()

    c = SVC(class_weight={'author': 1000, 'title': 1000, 'other': 1.0}, verbose=1)
    c.fit(X, Y)
    _f1('train', train, c)
    ff = _f1('test', test, c, verbose=1)

    if 0:
        import numpy as np
        import matplotlib.pyplot as pl
        from mpl_toolkits.mplot3d import Axes3D
        ax = pl.figure().add_subplot(111, projection='3d')
        pl.ion()

        data = []
        for (author_weight, title_weight) in iterview(np.random.uniform(1, 10, size=(100, 2))):
            print()
            print('params:', (author_weight, title_weight))
            c = SVC(class_weight={'author': author_weight,
                                  'title': title_weight,
                                  'other': 1.0},
                    verbose=1)
            #c = linear_model.SGDClassifier()
            c.fit(X, Y)
            #_f1('train', train, c)
            ff = _f1('test', test, c, verbose=1)
            score = sum(x for (_, _, _, _, x) in ff.scores(verbose=0))
            data.append((author_weight, title_weight, score))
            print('score:', score)

            x, y, z = list(zip(*data))
            ax.clear()
            ax.scatter(x, y, z)
            ax.figure.canvas.draw()

        print('done')
        pl.ioff()
        pl.show()
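# Note: _f1 and the training loop above map features to column indices via
# alphabet[k] % N_FEATURES, a feature-hashing style trick; distinct features
# may collide, which is accepted in exchange for a fixed-width representation.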
def kl_filter(data, verbose=True, progress=False, out=sys.stdout,
              feature_label_cuttoff=0, feature_count_cuttoff=0,
              do_label_count=False):
    """
    data = (label, [features ...])

    KL is a synonym for Information Gain

    KL( p(label) || p(label|feature) )
    """
    (L, F, data) = integerize(data)

    if do_label_count:
        label_count = defaultdict(int)
        for label, _ in data:
            label_count[label] += 1
        label_count = label_count.items()
        label_count.sort(key=lambda x: -x[1])   # sort by count
        print 'label count'
        for k, v in label_count:
            print '%20s => %s' % (k, v)
        sys.exit(0)

    K = len(L)
    M = len(F)

    if progress:
        from arsenal.iterextras import iterview
    else:
        iterview = lambda x, *a, **kw: x

    if progress:
        print >> sys.stderr, '\nTally'

    # label-feature tally (note: we ignore duplicate features)
    counts = zeros((K, M))
    for y, fv in iterview(data):
        counts[y, fv] += 1

    feature_counts = counts.sum(axis=0)

    if feature_count_cuttoff > 0:
        cut = feature_counts < feature_count_cuttoff

        #if verbose:
        print >> sys.stderr, '%s of %s below cutoff of %s' % (cut.sum(), len(feature_counts), feature_count_cuttoff)

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) features below cutoff' % (cut.sum(), M, cut.sum()*100.0/M)

        # zero-out features below cutoff
        counts[:, cut] = 0

    if feature_label_cuttoff:
        cut = counts < feature_label_cuttoff
        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cutoff' % (cut.sum(), K*M, cut.sum()*100.0/(K*M))
        # zero-out features below cutoff
        counts[cut] = 0

    label_prior = lidstone(counts.sum(axis=1), 0.001)   # avoids divide-by-zero

    # compute KL
    if progress:
        print >> sys.stderr, '\nKL'

    KL = zeros(M)
    for f in iterview(xrange(M)):
        label_given_f = lidstone(counts[:, f], 0.001)   # avoids divide-by-zero
        KL[f] = -kl_divergence(label_prior, label_given_f)

    # print KL-feature, most-informative first
    for i in KL.argsort():
        z = counts[:, i].sum()
        if z == 0:
            continue
        p = counts[:, i] * 1.0 / z
        l = [(v, k) for k, v in zip(L, p) if v > 0]
        l.sort()
        z = (-KL[i], F.lookup(i), l)
        if verbose:
            print >> out, '%8.6f\t%s\t%s' % (-KL[i], int(counts[:, i].sum()), F.lookup(i)), '\t\033[32m', ' '.join('%s(%.4f)' % (k, v) for v, k in l), '\033[0m'
        yield z
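# kl_filter relies on two small helpers it does not define here. Minimal
# sketches of what they are assumed to do (the real lidstone / kl_divergence
# implementations may differ, e.g. in normalization conventions):
import numpy as np

def lidstone_sketch(counts, alpha):
    "Additive (Lidstone) smoothing: add alpha to every count, then normalize."
    counts = np.asarray(counts, dtype=float) + alpha
    return counts / counts.sum()

def kl_divergence_sketch(p, q):
    "KL(p || q) for two distributions over the same support."
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / q[mask]))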
def ransac(data, model, n, k, t, d, debug=False):
    """
    fit model parameters to data using the RANSAC algorithm

    This implementation written from pseudocode found at
    http://en.wikipedia.org/w/index.php?title=RANSAC&oldid=116358182

    Given:
        data - a set of observed data points
        model - a model that can be fitted to data points
        n - the minimum number of data values required to fit the model
        k - the maximum number of iterations allowed in the algorithm
        t - a threshold value for determining when a data point fits a model
        d - the number of close data values required to assert that a model
            fits well to data

    Return:
        bestfit - model parameters which best fit the data (or None if no good
                  model is found)
    """
    bestfit = None
    besterr = numpy.inf
    best_inlier_idxs = None
    for i in iterview(xrange(k), 250):

        # randomly partition data (random_partition returns two arrays of ids)
        maybe_idxs, test_idxs = random_partition(n, data.shape[0])

        # get data points for each id
        maybeinliers = data[maybe_idxs, :]
        test_points = data[test_idxs]

        # fit model and check error on the test points
        maybemodel = model.fit(maybeinliers)
        test_err = model.get_error(test_points, maybemodel)

        # pick indices of test_points with acceptable error (below threshold)
        also_idxs = test_idxs[test_err < t]
        alsoinliers = data[also_idxs, :]

        if debug:
            print 'test_err.min()', test_err.min()
            print 'test_err.max()', test_err.max()
            print 'numpy.mean(test_err)', numpy.mean(test_err)
            print 'iteration %d: len(alsoinliers) = %d' % (i, len(alsoinliers))

        # Do we have enough values not included in the fit-partition to assert
        # that maybemodel fits well enough?
        if len(alsoinliers) > d:
            betterdata = numpy.concatenate((maybeinliers, alsoinliers))
            bettermodel = model.fit(betterdata)
            better_errs = model.get_error(betterdata, bettermodel)   # SSE per row
            thiserr = numpy.mean(better_errs)
            # only keep the best model, error, and data
            if thiserr < besterr:
                bestfit = bettermodel
                besterr = thiserr
                best_inlier_idxs = numpy.concatenate((maybe_idxs, also_idxs))

    if bestfit is None:
        raise ValueError("did not meet fit acceptance criteria")

    return bestfit, {'inliers': best_inlier_idxs}
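# ransac only assumes that `model` exposes fit(data) and get_error(data, params).
# A hypothetical least-squares line model matching that interface (illustration
# only, not part of the original module); columns of `data` are taken to be [x, y]:
import numpy

class LineModelSketch(object):

    def fit(self, data):
        # least-squares fit of y = m*x + b
        A = numpy.column_stack([data[:, 0], numpy.ones(len(data))])
        return numpy.linalg.lstsq(A, data[:, 1])[0]

    def get_error(self, data, params):
        # squared residual per row
        pred = params[0] * data[:, 0] + params[1]
        return (data[:, 1] - pred) ** 2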
def kl_filter(data, verbose=True, progress=False, out=sys.stdout,
              feature_count_cuttoff=0, feature_label_cuttoff=0):
    """
    data = (label, [features ...])

    KL is a synonym for Information Gain

    KL( p(label) || p(label|feature) )
    """
    (L, F, data) = integerize(data)

    K = len(L)
    M = len(F)

    if progress:
        from arsenal.iterextras import iterview
    else:
        iterview = lambda x, *a, **kw: x

    if progress:
        print >> sys.stderr, '\nTally'

    # label-feature tally (note: we ignore duplicate features)
    counts = zeros((K, M))
    for y, fv in iterview(data, every=5000):
        counts[y, fv] += 1

    feature_counts = counts.sum(axis=0)

    if feature_count_cuttoff:
        cut = feature_counts < feature_count_cuttoff
        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) features below cutoff' % (cut.sum(), M, cut.sum()*100.0/M)
        # zero-out features below cutoff
        counts[:, cut] = 0

    if feature_label_cuttoff:
        cut = counts < feature_label_cuttoff
        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cutoff' % (cut.sum(), K*M, cut.sum()*100.0/(K*M))
        # zero-out features below cutoff
        counts[cut] = 0

    label_prior = normalize(counts.sum(axis=1))

    # compute KL
    if progress:
        print >> sys.stderr, '\nKL'

    KL = zeros(M)
    for f in iterview(xrange(M), every=5000):
        label_given_f = lidstone(counts[:, f], 0.00001)   # avoids divide-by-zero
        KL[f] = -kl_divergence(label_prior, label_given_f)

    # print KL-feature, most-informative first
    for i in KL.argsort():
        p = counts[:, i] * 1.0 / counts[:, i].sum()
        l = [(v, k) for k, v in zip(L, p) if v > 0]
        l.sort()
        z = (-KL[i], F.lookup(i), l)
        if verbose:
            print >> out, '%8.6f\t%s' % (-KL[i], F.lookup(i)), '\t\033[32m', ' '.join('%s(%s)' % (k, v) for v, k in l), '\033[0m'
        yield z
def main():
    datafile = sys.argv[1]

    train, test = traintest(datafile)

    print 'train: %s, test: %s' % (len(train), len(test))

    from scipy.sparse import dok_matrix
    from sklearn import linear_model
    from sklearn.svm import SVC
    from arsenal.alphabet import Alphabet

    N_FEATURES = 100000

    alphabet = Alphabet(random_int=N_FEATURES)

    def _f1(name, data, c, verbose=True):
        if verbose:
            print
            print name
        f = F1()
        for (i, x) in enumerate(data):
            phi = dok_matrix((1, N_FEATURES))
            for k in x.features:
                phi[0, alphabet[k] % N_FEATURES] = 1.0
            [y] = c.predict(phi)
            f.report(i, y, x.label)
        f.scores(verbose=verbose)
        return f

    X = dok_matrix((len(train), N_FEATURES))

    M = len(train)
    Y = []
    X = dok_matrix((M, N_FEATURES))
    for i, x in enumerate(train):
        # binary features
        for k in x.features:
            X[i, alphabet[k] % N_FEATURES] = 1.0
        Y.append(x.label)
    X = X.tocsc()

    c = SVC(class_weight={'author': 1000, 'title': 1000, 'other': 1.0}, verbose=1)
    c.fit(X, Y)
    _f1('train', train, c)
    ff = _f1('test', test, c, verbose=1)

    if 0:
        import numpy as np
        import matplotlib.pyplot as pl
        from mpl_toolkits.mplot3d import Axes3D
        ax = pl.figure().add_subplot(111, projection='3d')
        pl.ion()

        data = []
        for (author_weight, title_weight) in iterview(np.random.uniform(1, 10, size=(100, 2))):
            print
            print 'params:', (author_weight, title_weight)
            c = SVC(class_weight={'author': author_weight,
                                  'title': title_weight,
                                  'other': 1.0},
                    verbose=1)
            #c = linear_model.SGDClassifier()
            c.fit(X, Y)
            #_f1('train', train, c)
            ff = _f1('test', test, c, verbose=1)
            score = sum(x for (_, _, _, _, x) in ff.scores(verbose=0))
            data.append((author_weight, title_weight, score))
            print 'score:', score

            x, y, z = zip(*data)
            ax.clear()
            ax.scatter(x, y, z)
            ax.figure.canvas.draw()

        print 'done'
        pl.ioff()
        pl.show()