Example 1
    def test_fontclos(self):

        # loop over SPGC books
        results = []
        books = get_books()
        for spgc_id in books.spgc_id:
            wfd = spgc_read(spgc_id)
            corpus = Corpus(wfd)

            # build TTR curve for this book
            corpus.seed = SEED
            TTR = corpus.TTR
            m_tokens = TTR.m_tokens.values
            n_types = TTR.n_types.values

            # log model
            lmodel = LogModel()
            predictions_log = lmodel.fit_predict(m_tokens, n_types)
            rmse_log = RMSE_pct(n_types, predictions_log)

            # Font-Clos model
            fmodel = FontClosModel()
            predictions_fontclos = fmodel.fit_predict(m_tokens, n_types)
            rmse_fontclos = RMSE_pct(n_types, predictions_fontclos)

            # append results
            results.append((
                spgc_id,
                corpus.M,
                corpus.N,
                lmodel.M_z,
                lmodel.N_z,
                fmodel.gamma,
                rmse_log,
                rmse_fontclos,
            ))

        # aggregate & analyze results
        results = pd.DataFrame(
            results,
            columns=[
                "id",
                "m_tokens",
                "n_types",
                "M_z",
                "N_z",
                "gamma",
                "RMSE_log",
                "RMSE_fontclos",
            ],
        ).set_index("id")

        # save results
        results.to_csv("data/fontclos.csv")

        # assert both models' performance
        assert results.RMSE_log.max() < 0.01
        assert results.RMSE_fontclos.max() < 0.015
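
    RMSE_pct is the goodness-of-fit metric used throughout these tests, but its implementation is not shown here. A minimal sketch, assuming it computes root-mean-square error relative to the observed values:

    import numpy as np

    def RMSE_pct(observed, predicted):
        # root-mean-square of the pointwise relative errors -- an assumption
        # about the real helper, which ships with the test suite
        observed = np.asarray(observed, dtype=float)
        predicted = np.asarray(predicted, dtype=float)
        return np.sqrt(np.mean(((predicted - observed) / observed) ** 2))

    Under that reading, the final assertions say the log model stays within 1% RMSE and the Font-Clos model within 1.5% across all SPGC books.
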
Example 2
    def test_high_dimensions(self):

        # retrieve corpus from NLTK
        filename = "melville-moby_dick.txt"
        words = gutenberg.words(filename)

        # initialize class
        corpus = Corpus(words)
        dim = 64
        corpus.dimension = dim
        corpus.seed = SEED
        TTR = corpus.TTR

        # k vector calculated correctly
        raw = corpus.k[:dim]
        agg = TTR.tail(1).values[0, 4:]
        assert all(agg == raw)

        # create & fit model
        m_tokens, n_types = TTR.m_tokens, TTR.n_types
        model = LogModel().fit(m_tokens, n_types)

        # predict E(M) & k_n(M)
        model.dimension = dim
        E_m = model.predict(corpus.M)
        k = model.predict_k(corpus.M, dim)
        assert len(k) == dim
        err = std_err(corpus.k[:dim], k)

        # visualize error
        if GRAPHICS_ON:
            import matplotlib.pyplot as plt

            plt.plot(err)
            plt.title("Error Analysis")
            plt.xlabel("n = k-vector index")
            plt.ylabel("std error")
            plt.show()
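
    std_err is likewise unshown. Since it is applied both to scalars (corpus.N vs. E_m in Example 5 below) and to whole k-vectors whose result is plotted pointwise, a plausible sketch is an elementwise relative error; note that zero entries such as k[0] would need guarding in practice:

    import numpy as np

    def std_err(observed, predicted):
        # elementwise relative error -- a guess at the real helper's behavior;
        # returns a scalar for scalar inputs, a vector for k-vector comparisons
        observed = np.asarray(observed, dtype=float)
        predicted = np.asarray(predicted, dtype=float)
        return np.abs(predicted - observed) / observed
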
Example 3
    def test_basic(self):

        # retrieve corpus from NLTK
        filename = "blake-poems.txt"
        words = gutenberg.words(filename)

        # initialize class
        corpus = Corpus(words)

        # sanity checks
        assert set(corpus.tokens) == set(words)
        assert corpus.M == len(words)
        assert corpus.N == len(set(words))
        assert corpus.fdist.freq.sum() == corpus.M
        assert corpus.fdist.shape[0] == corpus.N
        assert corpus.k[0] == 0
        assert corpus.k[1] == len(corpus.hapax)
        assert corpus.k[2] == len(corpus.dis)
        assert corpus.k[3] == len(corpus.tris)
        assert corpus.k[4] == len(corpus.tetrakis)
        assert corpus.k[5] == len(corpus.pentakis)
        assert corpus.k[42] == len(corpus.nlegomena(42))
        assert sum(corpus.k) == corpus.N
        assert len(corpus.types) == corpus.N
        assert corpus.WFD.equals(corpus.fdist)
        assert corpus.alpha == 0.9196082282619522
        assert corpus.gamma == 1.7739943128244318
        assert corpus.as_datarow(7) == (
            8354,
            1820,
            corpus.alpha,
            corpus.gamma,
            0,
            1009,
            293,
            138,
            72,
            57,
            41,
        )

        # sample()
        assert corpus.sample(99).M == 99
        assert corpus.sample(x=0).M == 0
        assert corpus.sample(x=1).M == corpus.M
        with self.assertRaises(Exception):
            corpus.sample(x=1.5)  # can't oversample with replace=False
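
    The assertions above pin down what corpus.k means: k[n] is the number of types occurring exactly n times (hapax, dis, tris, ... legomena), with k[0] == 0 and sum(k) == N. A standalone computation of that vector, for illustration:

    from collections import Counter
    from nltk.corpus import gutenberg

    def k_vector(words):
        # k[n] = number of types occurring exactly n times; k[0] = 0 by convention
        freqs = Counter(words)
        k = [0] * (max(freqs.values()) + 1)
        for count in freqs.values():
            k[count] += 1
        return k

    words = gutenberg.words("blake-poems.txt")
    k = k_vector(words)
    assert k[0] == 0 and sum(k) == len(set(words))  # every type lands in one bucket
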
Example 4
    def test_random_seed(self):

        # retrieve corpus from NLTK
        filename = "blake-poems.txt"
        words = gutenberg.words(filename)

        # sampling process works
        corpus = Corpus(words)
        corpus.seed = SEED
        before = corpus.TTR.n_types
        corpus = Corpus(words)
        corpus.seed = SEED
        after = corpus.TTR.n_types
        pd.testing.assert_series_equal(before, after)

        # explicit checks
        assert list(corpus.TTR.n_types[:9].values) == [
            73,
            115,
            176,
            215,
            253,
            285,
            328,
            359,
            381,
        ]
        assert corpus.sample(9).tokens == [
            '"',
            "O",
            "Tongue",
            "a",
            "fear",
            "go",
            "of",
            "the",
            "the",
        ]

        # model fitting works
        TTR = corpus.TTR
        model = LogModel()
        model.fit(TTR.m_tokens, TTR.n_types)
        before = model.params
        model.fit(TTR.m_tokens, TTR.n_types)
        after = model.params
        assert before == after
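
    How Corpus consumes its seed is internal to the library, but the property being verified is the standard one: seeding the random number generator makes sampling without replacement deterministic. A generic illustration of that pattern (note the expected nine-token sample above comes back in sorted order, so sample() apparently sorts its result):

    import numpy as np

    def draw(seed):
        # seeded sampling without replacement from a fixed token list
        rng = np.random.default_rng(seed)
        tokens = ["the", "quick", "brown", "fox", "jumps"]
        return sorted(rng.choice(tokens, size=3, replace=False))

    assert draw(42) == draw(42)  # identical seed, identical sample
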
Example 5
    def test_optimization(self):

        # retrieve corpus from NLTK
        filename = "melville-moby_dick.txt"
        words = gutenberg.words(filename)

        # TODO: why does SPGC model quality suffer relative to NLTK?

        # initialize class
        corpus = Corpus(words)
        dim = 6
        corpus.dimension = dim
        corpus.seed = SEED
        TTR = corpus.TTR

        # infer optimum sample size from observed hapax:type ratio
        hapax = corpus.k[1]
        model = LogModel().fit_naive(corpus.M, corpus.N, hapax)
        m_tokens, n_types = TTR.m_tokens, TTR.n_types

        # generate single prediction
        E_m = model.predict(corpus.M)
        k = model.predict_k(corpus.M, dim)
        assert std_err(corpus.N, E_m) < 0.0001
        assert std_err(corpus.k[1], k[1]) < 0.001
        assert std_err(corpus.k[2], k[2]) < 0.005
        assert std_err(corpus.k[3], k[3]) < 0.05
        assert std_err(corpus.k[4], k[4]) < 0.1
        assert std_err(corpus.k[5], k[5]) < 0.1

        # optimized predictions: worse fit at m=M, better fit overall
        E_m = model.predict(m_tokens)
        RMSE_before = RMSE_pct(n_types, E_m)
        E_m = model.fit_predict(m_tokens, n_types)
        RMSE_after = RMSE_pct(n_types, E_m)
        assert RMSE_after < RMSE_before

        # draw pretty pictures
        if GRAPHICS_ON:
            import matplotlib.pyplot as plt

            # predicted hapaxes
            predictions = E_m
            realization = n_types
            plt.scatter(m_tokens, realization)
            plt.plot(m_tokens, predictions, color="red")
            plt.title("Type-Token Relation (Log Formula)")
            plt.xlabel("tokens")
            plt.ylabel("types")
            plt.show()

            # predicted hapax fraction
            k = model.predict_k(m_tokens, dim)
            predictions = k[:, 1] / E_m
            realization = TTR.lego_1 / n_types
            plt.scatter(m_tokens, realization)
            plt.plot(m_tokens, predictions, color="red")
            plt.title("Hapax-Token Relation (Log Formula)")
            plt.xlabel("tokens")
            plt.ylabel("hapax fraction")
            plt.show()

            # predicted dis legomena fraction
            predictions = k[:, 2] / E_m
            realization = TTR.lego_2 / n_types
            plt.scatter(m_tokens, realization)
            plt.plot(m_tokens, predictions, color="red")
            plt.title("Dis-Token Relation (Log Formula)")
            plt.xlabel("tokens")
            plt.ylabel("dis legomena fraction")
            plt.show()
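
    fit_naive takes only three observables: the corpus size M, the vocabulary size N, and the hapax count, per the comment "infer optimum sample size from observed hapax:type ratio". For context, those inputs can be computed directly, without the Corpus class; a sketch:

    from collections import Counter
    from nltk.corpus import gutenberg

    # the three observables fit_naive consumes, computed by hand
    words = gutenberg.words("melville-moby_dick.txt")
    freqs = Counter(words)
    M, N = len(words), len(freqs)
    hapax = sum(1 for count in freqs.values() if count == 1)
    print(f"M={M}, N={N}, hapax:type ratio = {hapax / N:.3f}")
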
Example 6
    def test_models(self):

        # initialize class
        wfd = spgc_read("PG2701_counts.txt")  # moby dick
        corpus = Corpus(wfd)

        # build TTR curve
        corpus.seed = SEED
        TTR = corpus.TTR
        m_tokens = TTR.m_tokens.values
        n_types = TTR.n_types.values

        # fit Heaps' law model to TTR curve
        hmodel = HeapsModel().fit(m_tokens, n_types)
        predictions_heaps = hmodel.predict(m_tokens)
        H = hmodel.predict(1000)

        # infinite series
        imodel = InfSeriesModel(corpus)
        predictions_iseries = imodel.predict(m_tokens)
        I = imodel.predict(1000)

        # fit logarithmic model to TTR curve
        lmodel = LogModel().fit(m_tokens, n_types)
        predictions_log = lmodel.predict(m_tokens)
        L = lmodel.predict(1000)

        # fit Font-Clos model to TTR curve
        fmodel = FontClosModel().fit(m_tokens, n_types)
        predictions_fontclos = fmodel.predict(m_tokens)
        F = fmodel.predict(1000)

        # explicit check
        assert (H, I, L, F) == (756, 513, 515, 770)

        # draw pretty pictures
        if GRAPHICS_ON:
            import matplotlib.pyplot as plt

            # log-log graph of Heaps' model
            plt.scatter(m_tokens, n_types)
            plt.plot(m_tokens, predictions_heaps, color="red")
            plt.title("Heaps' Model (K,B) = (%0.4f, %0.4f)" % hmodel.params)
            plt.xscale("log")
            plt.yscale("log")
            plt.xlabel("log(tokens)")
            plt.ylabel("log(types)")
            plt.show()

            # normal graph of Heaps' model
            plt.scatter(m_tokens, n_types)
            plt.plot(m_tokens, predictions_heaps, color="red")
            plt.title("Heaps' Model (K,B) = (%0.4f, %0.4f)" % hmodel.params)
            plt.xlabel("tokens")
            plt.ylabel("types")
            plt.show()

            # Infinite Series Model
            plt.scatter(m_tokens, n_types)
            plt.plot(m_tokens, predictions_iseries, color="red")
            plt.title("Infinite Series Model")
            plt.xlabel("tokens")
            plt.ylabel("types")
            plt.show()

            # Logarithmic Model
            plt.scatter(m_tokens, n_types)
            plt.plot(m_tokens, predictions_log, color="red")
            plt.title("Logarithmic Model (M_z, N_z) = (%s, %s)" %
                      lmodel.params)
            plt.xlabel("tokens")
            plt.ylabel("types")
            plt.show()

            # Font-Clos Model
            plt.scatter(m_tokens, n_types)
            plt.plot(m_tokens, predictions_fontclos, color="red")
            plt.title("Font-Clos Model (γ = %s)" % fmodel.gamma)
            plt.xlabel("tokens")
            plt.ylabel("types")
            plt.show()
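
    Of the four models, only Heaps' law has a closed form simple enough to restate here: E[N] = K * M**B, a straight line in log-log space (which is why the first plot uses log scales). A minimal standalone fit under that assumption, independent of HeapsModel's internals:

    import numpy as np

    # Heaps' law on synthetic data: N = K * M**B is linear in log-log space
    m = np.array([1e3, 5e3, 1e4, 5e4, 1e5])
    n = 7.5 * m ** 0.55
    B, logK = np.polyfit(np.log(m), np.log(n), 1)  # slope = B, intercept = log K
    assert abs(B - 0.55) < 1e-9 and abs(np.exp(logK) - 7.5) < 1e-6
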
Example 7
    def test_spgc_nltk(self):

        # NLTK-SPGC lookup
        books = get_books()

        # build corpora from each source
        corpi = {}
        for book in books.itertuples():
            title = book.nltk_id.split(".")[0]
            wfd = spgc_read(book.spgc_id)
            corpus = Corpus(wfd)
            corpi[book.spgc_id] = ("SPGC", title, corpus)
            words = gutenberg.words(book.nltk_id)
            corpus = Corpus(words)
            corpi[book.nltk_id] = ("NLTK", title, corpus)

        # fit TTR curve for all & compare RMSE
        results = []
        for corpus_id, (source, title, corpus) in corpi.items():
            corpus.seed = SEED
            TTR = corpus.TTR
            m_tokens = TTR.m_tokens.values
            n_types = TTR.n_types.values

            # log model
            model = LogModel()
            predictions = model.fit_predict(m_tokens, n_types)
            rmse_log = RMSE_pct(n_types, predictions)

            # iseries model
            predictions = InfSeriesModel(corpus).predict(m_tokens)
            rmse_iseries = RMSE_pct(n_types, predictions)

            # heaps model
            predictions = HeapsModel().fit_predict(m_tokens, n_types)
            rmse_heaps = RMSE_pct(n_types, predictions)

            # append results
            results.append((
                corpus_id,
                title,
                source,
                corpus.M,
                corpus.N,
                model.M_z,
                model.N_z,
                rmse_log,
                rmse_iseries,
                rmse_heaps,
            ))

        # aggregate & analyze results
        results = pd.DataFrame(
            results,
            columns=[
                "id",
                "title",
                "source",
                "m_tokens",
                "n_types",
                "M_z",
                "N_z",
                "RMSE_log",
                "RMSE_iseries",
                "RMSE_heaps",
            ],
        ).set_index("id")

        # save results to data/books.csv
        results.to_csv("data/books.csv")

        # assert the log model outperforms the Heaps model
        assert results.RMSE_log.max() < 0.01
        assert results.RMSE_heaps.min() > 0.008
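
    spgc_read parses a Standardized Project Gutenberg Corpus (SPGC) counts file such as PG2701_counts.txt into the word-frequency distribution that Corpus accepts. The real reader isn't shown; a hypothetical stand-in, assuming the SPGC convention of one tab-separated "word<TAB>count" pair per line:

    from collections import Counter

    def read_counts(path):
        # hypothetical spgc_read stand-in: parse "word<TAB>count" lines
        wfd = Counter()
        with open(path, encoding="utf-8") as f:
            for line in f:
                word, count = line.rstrip("\n").split("\t")
                wfd[word] = int(count)
        return wfd
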