Example #1
def plot_diurnal(headers):
    """
    Diurnal plot of the emails, with years running along the x axis and times of
    day on the y axis.
    """
    xday = []
    ytime = []
    print 'making diurnal plot...'
    for h in iterview(headers):
        if len(h) > 1:
            try:
                s = h[1][5:].strip()   # drop the leading 'Date:' prefix from the header line
                x = dateutil.parser.parse(s)
            except ValueError:
                print
                print marquee(' ERROR: skipping ')
                print h
                print marquee()
                continue
            timestamp = mktime(x.timetuple())   # convert datetime into floating point number
            mailstamp = datetime.fromtimestamp(timestamp)
            xday.append(mailstamp)
            # Time of day at which the email arrived.
            # Note that the year, month and day are not important here.
            y = datetime(2010, 10, 14, mailstamp.hour, mailstamp.minute, mailstamp.second)
            ytime.append(y)
    plot_date(xday, ytime, '.', alpha=.7)
    xticks(rotation=30)
    return xday, ytime
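The function above leans on dateutil and a fixed dummy date to collapse every message onto a single y-axis day. Below is a standalone sketch of just that date handling; the 'Date: ...' header value is made up for illustration, while the slicing and the 2010-10-14 dummy date are taken from the code above:

from datetime import datetime
from time import mktime

import dateutil.parser

header_line = 'Date: Mon, 14 Oct 2010 09:15:02 +0000'
parsed = dateutil.parser.parse(header_line[5:].strip())          # drop 'Date:' and parse the rest
mailstamp = datetime.fromtimestamp(mktime(parsed.timetuple()))   # round-trip through a POSIX timestamp
# Re-attach the time of day to the fixed dummy date used for the y axis.
time_only = datetime(2010, 10, 14, mailstamp.hour, mailstamp.minute, mailstamp.second)
print time_only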
Example #2
def f1(data, name):
    print
    print 'Phrase-based F1:', name
    f1 = F1()
    for i, x in enumerate(iterview(data)):
        predict = extract_contiguous(model(x))
        truth = extract_contiguous(x.truth)
        # (i, begin, end) uniquely identifies the span
        for (label, begins, ends) in truth:
            f1.add_relevant(label, (i, begins, ends))
        for (label, begins, ends) in predict:
            f1.add_retrieved(label, (i, begins, ends))
    print
    return f1.scores(verbose=True)
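The F1 class itself is not shown in these snippets. A minimal sketch of the bookkeeping it appears to do (the PhraseF1 name and the micro-averaged scores() below are assumptions for illustration; the real class presumably also reports per-label scores when verbose=True):

class PhraseF1(object):
    """Collect gold and predicted (label, span-id) pairs and score their overlap."""

    def __init__(self):
        self.relevant = set()    # gold (label, (i, begin, end)) items
        self.retrieved = set()   # predicted (label, (i, begin, end)) items

    def add_relevant(self, label, item):
        self.relevant.add((label, item))

    def add_retrieved(self, label, item):
        self.retrieved.add((label, item))

    def scores(self):
        hits = len(self.relevant & self.retrieved)   # exact-match phrases
        p = hits * 1.0 / len(self.retrieved) if self.retrieved else 0.0
        r = hits * 1.0 / len(self.relevant) if self.relevant else 0.0
        f = 2 * p * r / (p + r) if p + r else 0.0
        return p, r, f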
Example #3
    def validate(model, iteration=None):

        def f1(data, name):
            print
            print 'Phrase-based F1:', name
            f1 = F1()
            for i, x in enumerate(iterview(data)):
                predict = extract_contiguous(model(x))
                truth = extract_contiguous(x.truth)
                # (i,begin,end) uniquely identifies the span
                for (label, begins, ends) in truth:
                    f1.add_relevant(label, (i, begins, ends))
                for (label, begins, ends) in predict:
                    f1.add_retrieved(label, (i, begins, ends))
            print
            return f1.scores(verbose=True)

        f1(train, name='TRAIN')
        f1(test, name='TEST')

        print
        print 'likelihood:', sum(map(crf.likelihood, iterview(train))) / len(train)
        print
        print
Example #4
def kl_filter(data, verbose=True, progress=False, out=sys.stdout,
              feature_count_cutoff=None, feature_label_cutoff=None):
    """
    data = (label, [features ...])

    KL is a synonym for Information Gain

    KL( p(label) || p(label|feature) )
    """
    (L, F, data) = integerize(data)   # map labels/features to contiguous integer ids

    K = len(L)   # number of labels
    M = len(F)   # number of features

    if progress:
        from iterextras import iterview
    else:
        iterview = lambda x, *a, **kw: x   # no-op stand-in when progress display is off

    if progress:
        print >> sys.stderr, '\nTally'

    # label-feature tally (note: we ignore duplicate features)
    counts = zeros((K,M))
    for y, fv in iterview(data, every=5000):
        counts[y, fv] += 1

    feature_counts = counts.sum(axis=0)

    if feature_count_cutoff:
        cut = feature_counts < feature_count_cutoff

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) features below cutoff' % \
                (cut.sum(), M, cut.sum()*100.0/M)

        # zero-out features below the cutoff
        counts[:, cut] = 0

    if feature_label_cutoff:
        cut = counts < feature_label_cutoff

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cutoff' % \
                (cut.sum(), K*M, cut.sum()*100.0/(K*M))

        # zero-out feature-label pairs below the cutoff
        counts[cut] = 0

    label_prior = normalize(counts.sum(axis=1))   # p(label), estimated from the (filtered) tally

    # compute KL
    if progress:
        print >> sys.stderr, '\nKL'

    KL = zeros(M)
    for f in iterview(xrange(M), every=5000):
        label_given_f = lidstone(counts[:,f], 0.00001)   # avoids divide-by-zero
        KL[f] = -kl_divergence(label_prior, label_given_f)   # negated so argsort() puts the most informative features first

    # emit (KL, feature) pairs, most-informative first
    for i in KL.argsort():
        p = counts[:,i] * 1.0 / counts[:,i].sum()   # empirical p(label | feature i)

        # (probability, label) pairs for the labels that co-occur with this feature
        l = [(v, k) for k, v in zip(L, p) if v > 0]
        l.sort()

        z = (-KL[i], F.lookup(i), l)

        if verbose:
            print >> out, '%8.6f\t%s' % (-KL[i], F.lookup(i)), '\t\033[32m', ' '.join('%s(%s)' % (k,v) for v, k in l), '\033[0m'

        yield z
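The kl_divergence and lidstone helpers are not shown here. A small self-contained sketch of the quantities the docstring names, using the standard definitions (the toy counts below are made up for illustration):

import numpy as np

def kl_divergence(p, q):
    # KL(p || q) = sum_i p_i * log(p_i / q_i), with 0 * log(0/q) taken as 0.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / q[mask]))

def lidstone(counts, alpha):
    # Add-alpha smoothing: keeps p(label|feature) strictly positive.
    counts = np.asarray(counts, dtype=float) + alpha
    return counts / counts.sum()

# Toy case: two labels with a uniform prior, and a feature that fires almost
# exclusively with label 0. The larger the KL, the more informative the feature.
label_prior = np.array([0.5, 0.5])
label_given_f = lidstone(np.array([9, 1]), 0.00001)
print kl_divergence(label_prior, label_given_f)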