def plot_diurnal(headers):
    """
    Diurnal plot of the emails, with years running along the x axis and
    times of day on the y axis.

    Returns (xday, ytime): the parsed arrival datetimes and, for each one,
    a datetime on a fixed dummy date (2010-10-14) carrying only the
    time-of-day, so the y axis shows hours/minutes/seconds.
    """
    xday = []
    ytime = []
    print 'making diurnal plot...'
    for h in iterview(headers):
        if len(h) > 1:
            try:
                # assumes h[1] is a date header whose first 5 chars are a
                # prefix (e.g. 'Date:') -- TODO confirm against the caller
                s = h[1][5:].strip()
                x = dateutil.parser.parse(s)
            except ValueError:
                # unparsable date: announce it loudly and skip this header
                print
                print marquee(' ERROR: skipping ')
                print h
                print marquee()
                continue
            # round-trip through a POSIX timestamp; this drops any tzinfo
            # dateutil attached, leaving a naive local-time datetime
            timestamp = mktime(x.timetuple())
            # convert datetime into floating point number
            mailstamp = datetime.fromtimestamp(timestamp)
            xday.append(mailstamp)
            # Time the email is arrived
            # Note that years, month and day are not important here.
            y = datetime(2010, 10, 14, mailstamp.hour, mailstamp.minute,
                         mailstamp.second)
            ytime.append(y)
    plot_date(xday, ytime, '.', alpha=.7)
    xticks(rotation=30)
    return xday, ytime
def f1(data, name): print print 'Phrase-based F1:', name f1 = F1() for i, x in enumerate(iterview(data)): predict = extract_contiguous(model(x)) truth = extract_contiguous(x.truth) # (i,begin,end) uniquely identifies the span for (label, begins, ends) in truth: f1.add_relevant(label, (i, begins, ends)) for (label, begins, ends) in predict: f1.add_retrieved(label, (i, begins, ends)) print return f1.scores(verbose=True)
def validate(model, iteration=None): def f1(data, name): print print 'Phrase-based F1:', name f1 = F1() for i, x in enumerate(iterview(data)): predict = extract_contiguous(model(x)) truth = extract_contiguous(x.truth) # (i,begin,end) uniquely identifies the span for (label, begins, ends) in truth: f1.add_relevant(label, (i, begins, ends)) for (label, begins, ends) in predict: f1.add_retrieved(label, (i, begins, ends)) print return f1.scores(verbose=True) f1(train, name='TRAIN') f1(test, name='TEST') print print 'likelihood:', sum(map(crf.likelihood, iterview(train))) / len(train) print print
def kl_filter(data, verbose=True, progress=False, out=sys.stdout):
    """
    Rank features by information gain and yield them, most informative first.

    data = (label, [features ...])

    KL is a synonym for Information Gain
        KL( p(label) || p(label|feature) )

    Yields (score, feature, [(p, label) ...]) triples where score is the
    (negated-back) divergence, sorted most-informative first; when *verbose*
    each triple is also printed to *out*.
    """
    (L, F, data) = integerize(data)
    K = len(L)   # number of distinct labels
    M = len(F)   # number of distinct features

    if progress:
        from iterextras import iterview
    else:
        iterview = lambda x, *a, **kw: x

    if progress:
        print >> sys.stderr, '\nTally'

    # label-feature tally (note: we ignore duplicate features)
    counts = zeros((K, M))
    for y, fv in iterview(data, every=5000):
        counts[y, fv] += 1

    feature_counts = counts.sum(axis=0)

    # NOTE(review): feature_count_cuttoff / feature_label_cuttoff are module
    # globals defined elsewhere in this file -- confirm they are always set.
    if feature_count_cuttoff:
        cut = feature_counts < feature_count_cuttoff
        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) features below cuttoff' % \
                (cut.sum(), M, cut.sum()*100.0/M)
        # zero-out features below cuttoff
        counts[:, cut] = 0

    if feature_label_cuttoff:
        cut = counts < feature_label_cuttoff
        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cuttoff' % \
                (cut.sum(), K*M, cut.sum()*100.0/(K*M))
        # zero-out feature-label pairs below cuttoff
        counts[cut] = 0

    label_prior = normalize(counts.sum(axis=1))

    # compute KL
    if progress:
        print >> sys.stderr, '\nKL'
    KL = zeros(M)
    for f in iterview(xrange(M), every=5000):
        label_given_f = lidstone(counts[:, f], 0.00001)  # avoids divide-by-zero
        KL[f] = -kl_divergence(label_prior, label_given_f)

    # print KL-feature, most-informative first (scores are negated, so an
    # ascending argsort puts the largest divergence first)
    for i in KL.argsort():
        total = counts[:, i].sum()
        if total == 0:
            # every count in this feature's column was zeroed by a cutoff
            # above; previously this divided by zero and yielded NaNs --
            # such features carry no signal, so skip them
            continue
        p = counts[:, i] * 1.0 / total
        l = [(v, k) for k, v in zip(L, p) if v > 0]
        l.sort()
        z = (-KL[i], F.lookup(i), l)
        if verbose:
            print >> out, '%8.6f\t%s' % (-KL[i], F.lookup(i)), '\t\033[32m', \
                ' '.join('%s(%s)' % (k, v) for v, k in l), '\033[0m'
        yield z