Example #1
def plot_diurnal(headers):
    """
    Diurnal plot of the emails, with years running along the x axis and times of
    day on the y axis.
    """
    xday = []
    ytime = []
    print 'making diurnal plot...'
    for h in iterview(headers):
        if len(h) > 1:
            try:
                s = h[1][5:].strip()
                x = dateutil.parser.parse(s)
            except ValueError:
                print
                print marquee(' ERROR: skipping ')
                print h
                print marquee()
                continue
            timestamp = mktime(x.timetuple())   # convert datetime into floating point number
            mailstamp = datetime.fromtimestamp(timestamp)
            xday.append(mailstamp)
            # Time the email arrived.
            # Note that the year, month, and day are not important here.
            y = datetime(2010, 10, 14, mailstamp.hour, mailstamp.minute, mailstamp.second)
            ytime.append(y)
    plot_date(xday,ytime,'.',alpha=.7)
    xticks(rotation=30)
    return xday,ytime
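The snippet above is shown without its imports. A minimal setup sketch of what it appears to rely on; the exact modules are assumptions (in particular, marquee is the banner helper shipped with IPython, though the original may define its own):

# Assumed imports for the plot_diurnal example above (hypothetical setup).
import dateutil.parser
from time import mktime                        # datetime -> POSIX timestamp
from datetime import datetime
from matplotlib.pylab import plot_date, xticks
from arsenal.iterextras import iterview        # progress meter over headers
from IPython.utils.text import marquee         # '***'-style error banners

# `headers` is expected to hold tuples whose second field carries a
# 'Date: ...' header line, so h[1][5:] strips the 'Date:' prefix, e.g.
# headers = [(subject, 'Date: Tue, 12 Oct 2010 08:15:00 +0000'), ...]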
Example #2
    def validate(model, iteration=None):

        def f1(data, name):
            print
            print 'Phrase-based F1:', name
            f1 = F1()
            for i, x in enumerate(iterview(data)):
                predict = extract_contiguous(model(x))
                truth = extract_contiguous(x.truth)
                # (i,begin,end) uniquely identifies the span
                for (label, begins, ends) in truth:
                    f1.add_relevant(label, (i, begins, ends))
                for (label, begins, ends) in predict:
                    f1.add_retrieved(label, (i, begins, ends))
            print
            return f1.scores(verbose=True)

        llh = sum(map(crf.likelihood, iterview(train, msg='llh'))) / len(train)

        with lineplot('llh') as d:
            d.append(llh)

        print
        print 'likelihood:', llh
        print
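The nested f1() above accumulates gold spans via add_relevant and predicted spans via add_retrieved. The real F1 class comes from arsenal; the toy version below only spells out the precision/recall arithmetic implied by those calls, and its interface is an assumption:

# Toy span-level F1 bookkeeping, mirroring the add_relevant/add_retrieved
# calls above (hypothetical re-implementation, not the arsenal class).
from collections import defaultdict

class ToyF1(object):
    def __init__(self):
        self.relevant = defaultdict(set)     # gold spans, keyed by label
        self.retrieved = defaultdict(set)    # predicted spans, keyed by label
    def add_relevant(self, label, item):
        self.relevant[label].add(item)
    def add_retrieved(self, label, item):
        self.retrieved[label].add(item)
    def scores(self, verbose=True):
        out = {}
        for label in set(self.relevant) | set(self.retrieved):
            R, T = self.retrieved[label], self.relevant[label]
            tp = len(R & T)                  # spans both predicted and gold
            p = tp / float(len(R)) if R else 0.0
            r = tp / float(len(T)) if T else 0.0
            f = 2 * p * r / (p + r) if (p + r) else 0.0
            if verbose:
                print('%15s P=%.3f R=%.3f F1=%.3f' % (label, p, r, f))
            out[label] = (p, r, f)
        return out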
Example #3
    def validate(model, iteration=None):

        def f1(data, name):
            print
            print 'Phrase-based F1:', name
            f1 = F1()
            for i, x in enumerate(iterview(data)):
                predict = extract_contiguous(model(x))
                truth = extract_contiguous(x.truth)
                # (i,begin,end) uniquely identifies the span
                for (label, begins, ends) in truth:
                    f1.add_relevant(label, (i, begins, ends))
                for (label, begins, ends) in predict:
                    f1.add_retrieved(label, (i, begins, ends))
            print
            return f1.scores(verbose=True)

        def weight_sparsity(W, t=0.0001):
            a = (np.abs(W) > t).sum()
            b = W.size
            print '%.2f (%s/%s) sparsity' % (a*100.0/b, a, b)

        f1(train, name='TRAIN')
        f1(test, name='TEST')

        print
        weight_sparsity(model.W)
        print
        print 'likelihood:', sum(map(crf.likelihood, iterview(train))) / len(train)
        print
        print
Example #4
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print
                print red % ('#' + '_' * len(ff))
                print red % ('#' + ff)
                print
                print('%s: %s' %
                      (yellow % 'meta', meta['title'])).encode('utf8')
                print('%s: %s' % (yellow % 'meta', ' ; '.join(
                    meta['author']))).encode('utf8')
                print
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
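For reference, a hypothetical consumer of the generator above, just to show the shape of what it yields:

# data() yields (metadata dict, skid Document, pdfminer text) triples for
# every cached pdf with an annotated author (hypothetical usage).
for meta, doc, text in data(verbose=False):
    print('%s by %s (%d chars of text)' % (meta['title'],
                                           ' ; '.join(meta['author']),
                                           len(text)))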
Example #5
    def validate(model, iteration=None):
        def f1(data, name):
            print
            print 'Phrase-based F1:', name
            f1 = F1()
            for i, x in enumerate(iterview(data)):
                predict = extract_contiguous(model(x))
                truth = extract_contiguous(x.truth)
                # (i,begin,end) uniquely identifies the span
                for (label, begins, ends) in truth:
                    f1.add_relevant(label, (i, begins, ends))
                for (label, begins, ends) in predict:
                    f1.add_retrieved(label, (i, begins, ends))
            print
            return f1.scores(verbose=True)

        def weight_sparsity(W, t=0.0001):
            a = (np.abs(W) > t).sum()
            b = W.size
            print '%.2f (%s/%s) sparsity' % (a * 100.0 / b, a, b)

        f1(train, name='TRAIN')
        f1(test, name='TEST')

        print
        weight_sparsity(model.W)
        print
        print 'likelihood:', sum(map(crf.likelihood,
                                     iterview(train))) / len(train)
        print
        print
Example #6
def plot_diurnal(headers):
    """
    Diurnal plot of the emails, with years running along the x axis and times of
    day on the y axis.
    """
    xday = []
    ytime = []
    print 'making diurnal plot...'
    for h in iterview(headers):
        if len(h) > 1:
            try:
                s = h[1][5:].strip()
                x = dateutil.parser.parse(s)
            except ValueError:
                print
                print marquee(' ERROR: skipping ')
                print h
                print marquee()
                continue
            timestamp = mktime(
                x.timetuple())  # convert datetime into floating point number
            mailstamp = datetime.fromtimestamp(timestamp)
            xday.append(mailstamp)
            # Time the email arrived.
            # Note that the year, month, and day are not important here.
            y = datetime(2010, 10, 14, mailstamp.hour, mailstamp.minute,
                         mailstamp.second)
            ytime.append(y)
    plot_date(xday, ytime, '.', alpha=.7)
    xticks(rotation=30)
    return xday, ytime
Example #7
 def validate(model, _):
     llh = sum(map(crf.likelihood, iterview(train, msg='llh'))) / len(train)
     _, _, _, _, f = zip(*f1(train, 'train', model))
     overall = 100 * np.mean(f)  # equally weighted average F1
     print()
     print(f'log-likelihood: {llh:g}')
     print(f'F1 overall: {overall:.2f}')
     print()
Example #8
    def predict(self,tokenlist):
        """ Takes a list of Tokens and returns the prediction on the data. """

        for i, x in enumerate(iterview([tokenlist])):
            if x:
                predict = extract_contiguous(self.model(x))
            else:
                return None
        return predict
Example #9
def pdfs(pattern):
    "Import pdfs with file matching pattern."
    for source in iterview(glob(pattern)):
        if ' ' in source:
            print '[WARN] No spaces allowed in document source... renaming'
            newsource = source.replace(' ', '_')
            os.rename(source, newsource)
            source = newsource
        add.document(source=source, tags=[], interactive=False)
Example #10
def delicious(xml):
    "Import links from delicious xml export. E.g. the output of delicious_import.py"
    with open(xml) as f:
        soup = BeautifulSoup(f)
        for post in iterview(soup.findAll('post')):
            print()
            add.document(source = post['href'],
                         tags = post['tag'],
                         title = post['description'],
                         notes = post['extended'],
                         interactive = False)
Example #11
def pdfs(pattern):
    "Import pdfs with file matching pattern."
    for source in iterview(glob(pattern)):
        if ' ' in source:
            print('[WARN] No spaces allowed in document source... renaming')
            newsource = source.replace(' ', '_')
            os.rename(source, newsource)
            source = newsource
        add.document(source = source,
                     tags = [],
                     interactive = False)
Example #12
 def f1(data, name):
     print
     print 'Phrase-based F1:', name
     f1 = F1()
     for i, x in enumerate(iterview(data)):
         predict = extract_contiguous(model(x))
         truth = extract_contiguous(x.truth)
         # (i,begin,end) uniquely identifies the span
         for (label, begins, ends) in truth:
             f1.add_relevant(label, (i, begins, ends))
         for (label, begins, ends) in predict:
             f1.add_retrieved(label, (i, begins, ends))
     print
     return f1.scores(verbose=True)
Example #13
def _test_binarize_unbinarize(trees):
    from arsenal.iterextras import iterview
    for t in iterview(list(trees)):
        b = binarize(t)

        # check that tree is indeed binary
        assert check_binary_tree(b), b

        # check roundtrip 'original tree'->'binary tree'->'original tree'
        u = unbinarize(b)
        assert u == t

        # unbinarize on an unbinarized tree should do nothing (other than copy
        # the tree)
        assert unbinarize(t) == t
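iterview, imported above from arsenal.iterextras, wraps an iterable with a progress meter. A minimal usage sketch, restricted to the keyword arguments that actually appear in these examples (every= and msg=):

# Minimal iterview usage sketch (progress display while iterating).
from arsenal.iterextras import iterview

total = 0
for x in iterview(range(100000), every=10000):   # update every 10k items
    total += x
print(total)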
Example #14
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print
                print red % ('#' + '_' *len(ff))
                print red % ('#' + ff)
                print
                print ('%s: %s' % (yellow % 'meta', meta['title'])).encode('utf8')
                print ('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8')
                print
            yield (meta, d, pdfminer(filename))
Example #15
def run_test():
    [train, _] = partition(get_data('data/tagged_references.txt'), [0.01, 0.0])
    (L, A) = build_domain(train)
    crf = StringCRF(L, A)

    print('Testing gradient of log-likelihood....')
    crf.preprocess(train)
    crf.W[:] = np.random.uniform(-1, 1, size=crf.W.shape)
    test_gradient(crf, train)

    # Check that we have enough features to overfit this small training set.
    crf.sgd(train, iterations=10)

    llh = sum(map(crf.likelihood, iterview(train, msg='llh'))) / len(train)
    print(f'log-likelihood {llh:g}')

    _, _, _, _, f = zip(*f1(train, 'train', crf))
    overall = 100 * np.mean(f)  # equally weighted average F1
    print(f'Overall F1 (train): {overall:.2f}')
Example #16
def learn(data, test):
    labels = {x.label for x in data}
    w = {y: defaultdict(float) for y in labels}
    for t in iterview(range(10), every=1):

#        print
#        print
#        print 'Iteration', t

        alpha = 10.0 / (t + 1)**0.8
        for x in data:
            y = predict(w, x.features)
            if x.label != y:
                for k in x.features:
                    w[x.label][k] += alpha
                    w[y][k] -= alpha

#        f1('train', data, w)
#        f1('test', test, w)

    return w
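learn() calls a predict() helper that is not part of the snippet. A plausible sketch under the assumption that it is the usual multiclass perceptron decision rule (argmax over per-label weight vectors):

# Hypothetical predict() used by learn() above: score each label by summing
# its weights over the example's active (binary) features, return the argmax.
def predict(w, features):
    return max(w, key=lambda y: sum(w[y][k] for k in features))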
Example #17
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print()
                print(colors.red % ('#' + '_' *len(ff)))
                print(colors.red % ('#' + ff))
                print()
                print(('%s: %s' % (colors.yellow % 'meta', meta['title'])).encode('utf8'))
                print(('%s: %s' % (colors.yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8'))
                print()
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
Example #18
def feature_label_freq_filter(data, c, threshold=5):
    for x in iterview(data, every=int(len(data)*.1)):
        y = x.label
        x.features = [k for k in x.features if c[y, k] >= threshold]
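feature_label_freq_filter takes a table c of (label, feature) counts. A small sketch of how such a table might be built; the helper name is hypothetical:

# Hypothetical helper producing the (label, feature) -> count table `c`
# expected by feature_label_freq_filter.
from collections import Counter

def label_feature_counts(data):
    c = Counter()
    for x in data:
        for k in x.features:
            c[x.label, k] += 1
    return c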
Example #19
def main():
    datafile = sys.argv[1]

    train, test = traintest(datafile)
    print('train: %s, test: %s' % (len(train), len(test)))


    from scipy.sparse import dok_matrix
    from sklearn import linear_model
    from sklearn.svm import SVC
    from arsenal.alphabet import Alphabet

    N_FEATURES = 100000

    alphabet = Alphabet(random_int=N_FEATURES)

    def _f1(name, data, c, verbose=True):
        if verbose:
            print()
            print(name)
        f = F1()
        for (i, x) in enumerate(data):

            phi = dok_matrix((1, N_FEATURES))
            for k in x.features:
                phi[0, alphabet[k] % N_FEATURES] = 1.0

            [y] = c.predict(phi)
            f.report(i, y, x.label)
        f.scores(verbose=verbose)
        return f

    X = dok_matrix((len(train), N_FEATURES))

    M = len(train)

    Y = []
    X = dok_matrix((M, N_FEATURES))
    for i, x in enumerate(train):
        # binary features
        for k in x.features:
            X[i, alphabet[k] % N_FEATURES] = 1.0
        Y.append(x.label)
    X = X.tocsc()


    c = SVC(class_weight={'author': 1000,
                          'title': 1000,
                          'other': 1.0},
            verbose=1)

    c.fit(X, Y)

    _f1('train', train, c)
    ff = _f1('test', test, c, verbose=1)

    if 0:
        import numpy as np
        import matplotlib.pyplot as pl
        from mpl_toolkits.mplot3d import Axes3D
        ax = pl.figure().add_subplot(111, projection='3d')

        pl.ion()

        data = []

        for (author_weight, title_weight) in iterview(np.random.uniform(1, 10, size=(100, 2))):
            print()
            print('params:', (author_weight, title_weight))

            c = SVC(class_weight={'author': author_weight,
                                  'title': title_weight,
                                  'other': 1.0},
                    verbose=1)

            #c = linear_model.SGDClassifier()
            c.fit(X, Y)

            #_f1('train', train, c)
            ff = _f1('test', test, c, verbose=1)

            score = sum(x for (_, _, _, _, x) in ff.scores(verbose=0))

            data.append((author_weight, title_weight, score))
            print('score:', score)

            x,y,z=list(zip(*data))
            ax.clear()
            ax.scatter(x,y,z)
            ax.figure.canvas.draw()

        print('done')
        pl.ioff()
        pl.show()
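The encoding above buckets string features into a fixed-width sparse vector via alphabet[k] % N_FEATURES. A library-free sketch of the same hashing-trick idea using Python's built-in hash; the helper is hypothetical:

# Hashing-trick feature encoding, analogous to alphabet[k] % N_FEATURES
# above (hypothetical helper; hash collisions are simply tolerated).
from scipy.sparse import dok_matrix

def encode(examples, n_features=100000):
    X = dok_matrix((len(examples), n_features))
    for i, x in enumerate(examples):
        for k in x.features:
            X[i, hash(k) % n_features] = 1.0
    return X.tocsc()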
Example #20
def kl_filter(data,
              verbose=True,
              progress=False,
              out=sys.stdout,
              feature_label_cuttoff=0,
              feature_count_cuttoff=0,
              do_label_count=False):
    """
    data = (label, [features ...])

    KL is a synonym for Information Gain

    KL( p(label) || p(label|feature) )
    """
    (L, F, data) = integerize(data)

    if do_label_count:
        label_count = defaultdict(int)
        for label, _ in data:
            label_count[label] += 1
        label_count = label_count.items()
        label_count.sort(key=lambda x: -x[1])  # sort by count
        print 'label count'
        for k,v in label_count:
            print '%20s => %s' % (k, v)
        sys.exit(0)

    K = len(L)
    M = len(F)

    if progress:
        from arsenal.iterextras import iterview
    else:
        iterview = lambda x, *a, **kw: x

    if progress:
        print >> sys.stderr, '\nTally'

    # label-feature tally (note: we ignore duplicate features)
    counts = zeros((K,M))
    for y, fv in iterview(data):
        counts[y, fv] += 1

    feature_counts = counts.sum(axis=0)

    if feature_count_cuttoff > 0:
        cut = feature_counts < feature_count_cuttoff

        #if verbose:
        print >> sys.stderr, '%s of %s below cutoff of %s' \
            % (cut.sum(), len(feature_counts), feature_count_cuttoff)

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) features below cutoff' % \
                (cut.sum(), M, cut.sum()*100.0/M)

        # zero-out features below cutoff
        counts[:, cut] = 0

    if feature_label_cuttoff:
        cut = counts < feature_label_cuttoff

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cutoff' % \
                (cut.sum(), K*M, cut.sum()*100.0/(K*M))

        # zero-out features below cutoff
        counts[cut] = 0

    label_prior = lidstone(counts.sum(axis=1), 0.001)  # avoids divide-by-zero

    # compute KL
    if progress:
        print >> sys.stderr, '\nKL'

    KL = zeros(M)
    for f in iterview(xrange(M)):
        label_given_f = lidstone(counts[:,f], 0.001)   # avoids divide-by-zero
        KL[f] = -kl_divergence(label_prior, label_given_f)

    # print KL-feature, most-informative first
    for i in KL.argsort():

        z = counts[:,i].sum()

        if z == 0:
            continue

        p = counts[:,i] * 1.0 / z

        l = [(v, k) for k,v in zip(L, p) if v > 0]
        l.sort()

        z = (-KL[i], F.lookup(i), l)

        if verbose:
            print >> out, '%8.6f\t%s\t%s' % (-KL[i], int(counts[:,i].sum()), F.lookup(i)), '\t\033[32m', ' '.join('%s(%.4f)' % (k,v) for v, k in l), '\033[0m'

        yield z
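The score computed above is KL( p(label) || p(label|feature) ) per feature, with Lidstone smoothing to avoid zeros. A self-contained toy version of that computation; the real lidstone and kl_divergence helpers live elsewhere (arsenal), so the ones below are assumptions that follow the standard definitions:

# Toy per-feature information-gain score, mirroring the kl_filter math above.
import numpy as np

def lidstone(counts, alpha):
    # add-alpha smoothing, then normalize to a distribution (assumed helper)
    c = np.asarray(counts, dtype=float) + alpha
    return c / c.sum()

def kl_divergence(p, q):
    # KL(p || q) for strictly positive distributions (assumed helper)
    return float(np.sum(p * np.log(p / q)))

# toy label-feature tally: 2 labels x 3 features
counts = np.array([[9., 1., 5.],
                   [1., 9., 5.]])
label_prior = lidstone(counts.sum(axis=1), 0.001)
for f in range(counts.shape[1]):
    label_given_f = lidstone(counts[:, f], 0.001)
    print(f, kl_divergence(label_prior, label_given_f))
# features 0 and 1 (skewed towards one label) score high; feature 2
# (evenly split across labels) scores ~0, i.e. uninformative.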
Example #21
def ransac(data, model, n, k, t, d, debug=False):
    """
    fit model parameters to data using the RANSAC algorithm

    This implementation was written from pseudocode found at
        http://en.wikipedia.org/w/index.php?title=RANSAC&oldid=116358182

    Given:
        data - a set of observed data points
        model - a model that can be fitted to data points
        n - the minimum number of data values required to fit the model
        k - the maximum number of iterations allowed in the algorithm
        t - a threshold value for determining when a data point fits a model
        d - the number of close data values required to assert that a model fits well to data
    Return:
        bestfit - model parameters which best fit the data (or None if no good model is found)
    """

    bestfit = None
    besterr = numpy.inf
    best_inlier_idxs = None

    for i in iterview(xrange(k), 250):
        # randomly partition data (random_partition returns two arrays of ids)
        maybe_idxs, test_idxs = random_partition(n, data.shape[0])

        # get data points for each id
        maybeinliers = data[maybe_idxs, :]
        test_points = data[test_idxs]

        # fit model and check error on the test points
        maybemodel = model.fit(maybeinliers)
        test_err = model.get_error(test_points, maybemodel)

        # pick indices of test_points with acceptable error (below threshold)
        also_idxs = test_idxs[test_err < t]
        alsoinliers = data[also_idxs, :]

        if debug:
            print 'test_err.min()', test_err.min()
            print 'test_err.max()', test_err.max()
            print 'numpy.mean(test_err)', numpy.mean(test_err)
            print 'iteration %d: len(alsoinliers) = %d' % (i, len(alsoinliers))

        # Do we have enough values not included in the fit-partition to assert that
        # maybemodel fits well-enough?
        if len(alsoinliers) > d:

            betterdata = numpy.concatenate((maybeinliers, alsoinliers))
            bettermodel = model.fit(betterdata)
            better_errs = model.get_error(betterdata,
                                          bettermodel)  # SSE per row
            thiserr = numpy.mean(better_errs)

            # only keep the best model, error, and data
            if thiserr < besterr:
                bestfit = bettermodel
                besterr = thiserr
                best_inlier_idxs = numpy.concatenate((maybe_idxs, also_idxs))

    if bestfit is None:
        raise ValueError("did not meet fit acceptance criteria")

    return bestfit, {'inliers': best_inlier_idxs}
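ransac() is written against two things that are not shown here: a random_partition helper and a model object exposing fit / get_error. A minimal sketch of both, assuming a least-squares line fit; these are illustrative stand-ins, not the original implementations:

# Hypothetical helpers satisfying the interface ransac() calls above.
import numpy

def random_partition(n, n_data):
    # shuffle row indices and split into (first n, the rest)
    idxs = numpy.arange(n_data)
    numpy.random.shuffle(idxs)
    return idxs[:n], idxs[n:]

class LineModel:
    """Fit y = a*x + b to rows [x, y]; error is the squared residual per row."""
    def fit(self, data):
        A = numpy.column_stack([data[:, 0], numpy.ones(len(data))])
        return numpy.linalg.lstsq(A, data[:, 1], rcond=None)[0]
    def get_error(self, data, params):
        pred = params[0] * data[:, 0] + params[1]
        return (data[:, 1] - pred) ** 2

# usage sketch:
# bestfit, info = ransac(data, LineModel(), n=10, k=100, t=0.5, d=50)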
Example #22
def kl_filter(data, verbose=True, progress=False, out=sys.stdout,
              feature_label_cuttoff=0, feature_count_cuttoff=0):
    """
    data = (label, [features ...])

    KL is a synonym for Information Gain

    KL( p(label) || p(label|feature) )
    """
    (L, F, data) = integerize(data)

    K = len(L)
    M = len(F)

    if progress:
        from arsenal.iterextras import iterview
    else:
        iterview = lambda x, *a, **kw: x

    if progress:
        print >> sys.stderr, '\nTally'

    # label-feature tally (note: we ignore duplicate features)
    counts = zeros((K,M))
    for y, fv in iterview(data, every=5000):
        counts[y, fv] += 1

    feature_counts = counts.sum(axis=0)

    if feature_count_cuttoff:
        cut = feature_counts < feature_count_cuttoff

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) features below cutoff' % \
                (cut.sum(), M, cut.sum()*100.0/M)

        # zero-out features below cutoff
        counts[:, cut] = 0

    if feature_label_cuttoff:
        cut = counts < feature_label_cuttoff

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cutoff' % \
                (cut.sum(), K*M, cut.sum()*100.0/(K*M))

        # zero-out features below cutoff
        counts[cut] = 0

    label_prior = normalize(counts.sum(axis=1))

    # compute KL
    if progress:
        print >> sys.stderr, '\nKL'

    KL = zeros(M)
    for f in iterview(xrange(M), every=5000):
        label_given_f = lidstone(counts[:,f], 0.00001)   # avoids divide-by-zero
        KL[f] = -kl_divergence(label_prior, label_given_f)

    # print KL-feature, most-informative first
    for i in KL.argsort():
        p = counts[:,i] * 1.0 / counts[:,i].sum()

        l = [(v, k) for k,v in zip(L, p) if v > 0]
        l.sort()

        z = (-KL[i], F.lookup(i), l)

        if verbose:
            print >> out, '%8.6f\t%s' % (-KL[i], F.lookup(i)), '\t\033[32m', ' '.join('%s(%s)' % (k,v) for v, k in l), '\033[0m'

        yield z
Example #23
def ransac(data, model, n, k, t, d, debug=False):
    """
    fit model parameters to data using the RANSAC algorithm

    This implementation was written from pseudocode found at
        http://en.wikipedia.org/w/index.php?title=RANSAC&oldid=116358182

    Given:
        data - a set of observed data points
        model - a model that can be fitted to data points
        n - the minimum number of data values required to fit the model
        k - the maximum number of iterations allowed in the algorithm
        t - a threshold value for determining when a data point fits a model
        d - the number of close data values required to assert that a model fits well to data
    Return:
        bestfit - model parameters which best fit the data (or None if no good model is found)
    """

    bestfit          = None
    besterr          = numpy.inf
    best_inlier_idxs = None

    for i in iterview(xrange(k), 250):
        # randomly partition data (random_partition returns two arrays of ids)
        maybe_idxs, test_idxs = random_partition(n, data.shape[0])

        # get data points for each id
        maybeinliers = data[maybe_idxs,:]
        test_points  = data[test_idxs]

        # fit model and check error on the test points
        maybemodel   = model.fit(maybeinliers)
        test_err     = model.get_error(test_points, maybemodel)

        # pick indices of test_points with acceptable error (below threshold)
        also_idxs    = test_idxs[test_err < t]
        alsoinliers  = data[also_idxs,:]

        if debug:
            print 'test_err.min()', test_err.min()
            print 'test_err.max()', test_err.max()
            print 'numpy.mean(test_err)', numpy.mean(test_err)
            print 'iteration %d: len(alsoinliers) = %d' % (i, len(alsoinliers))

        # Do we have enough values not included in the fit-partition to assert that
        # maybemodel fits well-enough?
        if len(alsoinliers) > d:

            betterdata  = numpy.concatenate((maybeinliers, alsoinliers))
            bettermodel = model.fit(betterdata)
            better_errs = model.get_error(betterdata, bettermodel)  # SSE per row
            thiserr     = numpy.mean(better_errs)

            # only keep the best model, error, and data
            if thiserr < besterr:
                bestfit = bettermodel
                besterr = thiserr
                best_inlier_idxs = numpy.concatenate((maybe_idxs, also_idxs))

    if bestfit is None:
        raise ValueError("did not meet fit acceptance criteria")

    return bestfit, {'inliers': best_inlier_idxs}
Example #24
def main():
    datafile = sys.argv[1]

    train, test = traintest(datafile)
    print 'train: %s, test: %s' % (len(train), len(test))

    from scipy.sparse import dok_matrix
    from sklearn import linear_model
    from sklearn.svm import SVC
    from arsenal.alphabet import Alphabet

    N_FEATURES = 100000

    alphabet = Alphabet(random_int=N_FEATURES)

    def _f1(name, data, c, verbose=True):
        if verbose:
            print
            print name
        f = F1()
        for (i, x) in enumerate(data):

            phi = dok_matrix((1, N_FEATURES))
            for k in x.features:
                phi[0, alphabet[k] % N_FEATURES] = 1.0

            [y] = c.predict(phi)
            f.report(i, y, x.label)
        f.scores(verbose=verbose)
        return f

    X = dok_matrix((len(train), N_FEATURES))

    M = len(train)

    Y = []
    X = dok_matrix((M, N_FEATURES))
    for i, x in enumerate(train):
        # binary features
        for k in x.features:
            X[i, alphabet[k] % N_FEATURES] = 1.0
        Y.append(x.label)
    X = X.tocsc()

    c = SVC(class_weight={
        'author': 1000,
        'title': 1000,
        'other': 1.0
    },
            verbose=1)

    c.fit(X, Y)

    _f1('train', train, c)
    ff = _f1('test', test, c, verbose=1)

    if 0:
        import numpy as np
        import matplotlib.pyplot as pl
        from mpl_toolkits.mplot3d import Axes3D
        ax = pl.figure().add_subplot(111, projection='3d')

        pl.ion()

        data = []

        for (author_weight,
             title_weight) in iterview(np.random.uniform(1, 10,
                                                         size=(100, 2))):
            print
            print 'params:', (author_weight, title_weight)

            c = SVC(class_weight={
                'author': author_weight,
                'title': title_weight,
                'other': 1.0
            },
                    verbose=1)

            #c = linear_model.SGDClassifier()
            c.fit(X, Y)

            #_f1('train', train, c)
            ff = _f1('test', test, c, verbose=1)

            score = sum(x for (_, _, _, _, x) in ff.scores(verbose=0))

            data.append((author_weight, title_weight, score))
            print 'score:', score

            x, y, z = zip(*data)
            ax.clear()
            ax.scatter(x, y, z)
            ax.figure.canvas.draw()

        print 'done'
        pl.ioff()
        pl.show()
Example #25
def feature_label_freq_filter(data, c, threshold=5):
    for x in iterview(data, every=int(len(data) * .1)):
        y = x.label
        x.features = [k for k in x.features if c[y, k] >= threshold]
Example #26
def kl_filter(data, verbose=True, progress=False, out=sys.stdout,
              feature_label_cuttoff=0, feature_count_cuttoff=0):
    """
    data = (label, [features ...])

    KL is a synonym for Information Gain

    KL( p(label) || p(label|feature) )
    """
    (L, F, data) = integerize(data)

    K = len(L)
    M = len(F)

    if progress:
        from arsenal.iterextras import iterview
    else:
        iterview = lambda x, *a, **kw: x

    if progress:
        print >> sys.stderr, '\nTally'

    # label-feature tally (note: we ignore duplicate features)
    counts = zeros((K, M))
    for y, fv in iterview(data, every=5000):
        counts[y, fv] += 1

    feature_counts = counts.sum(axis=0)

    if feature_count_cuttoff:
        cut = feature_counts < feature_count_cuttoff

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) features below cutoff' % \
                (cut.sum(), M, cut.sum()*100.0/M)

        # zero-out features below cutoff
        counts[:, cut] = 0

    if feature_label_cuttoff:
        cut = counts < feature_label_cuttoff

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cutoff' % \
                (cut.sum(), K*M, cut.sum()*100.0/(K*M))

        # zero-out features below cutoff
        counts[cut] = 0

    label_prior = normalize(counts.sum(axis=1))

    # compute KL
    if progress:
        print >> sys.stderr, '\nKL'

    KL = zeros(M)
    for f in iterview(xrange(M), every=5000):
        label_given_f = lidstone(counts[:, f],
                                 0.00001)  # avoids divide-by-zero
        KL[f] = -kl_divergence(label_prior, label_given_f)

    # print KL-feature, most-informative first
    for i in KL.argsort():
        p = counts[:, i] * 1.0 / counts[:, i].sum()

        l = [(v, k) for k, v in zip(L, p) if v > 0]
        l.sort()

        z = (-KL[i], F.lookup(i), l)

        if verbose:
            print >> out, '%8.6f\t%s' % (-KL[i],
                                         F.lookup(i)), '\t\033[32m', ' '.join(
                                             '%s(%s)' % (k, v)
                                             for v, k in l), '\033[0m'

        yield z