from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline


class UrlNGram:
    """Character-level n-gram language model fit on a collection of URLs."""

    def __init__(self, urls, n=2):
        self.ngram = MLE(n)
        train_data, padded_sents = padded_everygram_pipeline(n, urls)
        self.ngram.fit(train_data, padded_sents)

    def get_entropy(self, url):
        return self.ngram.entropy(list(url))

    def get_perplexity(self, url):
        return self.ngram.perplexity(list(url))
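
A minimal usage sketch (not part of the original example), assuming the training URLs are pre-tokenized into character lists, which is what padded_everygram_pipeline expects:

# Hypothetical data: each training URL is split into a list of characters.
train_urls = [list("example.com/news"), list("another-site.org/article")]
url_model = UrlNGram(train_urls, n=2)
print(url_model.get_entropy("example.com"))     # average negative log2 probability per character
print(url_model.get_perplexity("example.com"))  # equals 2 ** entropy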
Example #2
import math
import unittest

from nltk.lm import MLE


class MleBigramTests(unittest.TestCase):
    """Unit tests for the MLE ngram model."""

    score_tests = [
        ("d", ["c"], 1),
        # Unseen ngrams should yield 0
        ("d", ["e"], 0),
        # Unigrams should also be 0
        ("z", None, 0),
        # N unigrams = 14
        # count('a') = 2
        ("a", None, 2.0 / 14),
        # count('y') = 3
        ("y", None, 3.0 / 14),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = MLE(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_logscore_zero_score(self):
        # logscore of unseen ngrams should be -inf
        logscore = self.model.logscore("d", ["e"])

        self.assertTrue(math.isinf(logscore))

    def test_entropy_perplexity_seen(self):
        # ngrams seen during training
        trained = [
            ("<s>", "a"),
            ("a", "b"),
            ("b", "<UNK>"),
            ("<UNK>", "a"),
            ("a", "d"),
            ("d", "</s>"),
        ]
        # Ngram = Log score
        # <s>, a    = -1
        # a, b      = -1
        # b, UNK    = -1
        # UNK, a    = -1.585
        # a, d      = -1
        # d, </s>   = -1
        # TOTAL logscores   = -6.585
        # - AVG logscores   = 1.0975
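        # perplexity is defined as 2 ** entropy, so 2 ** 1.0975 ≈ 2.1398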
        H = 1.0975
        perplexity = 2.1398

        self.assertAlmostEqual(H, self.model.entropy(trained), places=4)
        self.assertAlmostEqual(perplexity,
                               self.model.perplexity(trained),
                               places=4)

    def test_entropy_perplexity_unseen(self):
        # In MLE, even one unseen ngram should make entropy and perplexity infinite
        untrained = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]

        self.assertTrue(math.isinf(self.model.entropy(untrained)))
        self.assertTrue(math.isinf(self.model.perplexity(untrained)))

    def test_entropy_perplexity_unigrams(self):
        # word = score, log score
        # <s>   = 0.1429, -2.8074
        # a     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # UNK   = 0.2143, -2.2224
        # d     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # </s>  = 0.1429, -2.8074
        # TOTAL logscores = -21.6243
        # - AVG logscores = 3.0095
        H = 3.0095
        perplexity = 8.0529

        text = [("<s>", ), ("a", ), ("c", ), ("-", ), ("d", ), ("c", ),
                ("</s>", )]

        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity,
                               self.model.perplexity(text),
                               places=4)
Example #3
import math

import pytest
from nltk.lm import MLE


class TestMleBigram(metaclass=ParametrizedTests):
    """Unit tests for MLE ngram model."""

    score_tests = [
        ("d", ["c"], 1),
        # Unseen ngrams should yield 0
        ("d", ["e"], 0),
        # Unigrams should also be 0
        ("z", None, 0),
        # N unigrams = 14
        # count('a') = 2
        ("a", None, 2.0 / 14),
        # count('y') = 3
        ("y", None, 3.0 / 14),
    ]

    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = MLE(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_logscore_zero_score(self):
        # logscore of unseen ngrams should be -inf
        logscore = self.model.logscore("d", ["e"])
        assert math.isinf(logscore)

    def test_entropy_perplexity_seen(self):
        # ngrams seen during training
        trained = [
            ("<s>", "a"),
            ("a", "b"),
            ("b", "<UNK>"),
            ("<UNK>", "a"),
            ("a", "d"),
            ("d", "</s>"),
        ]
        # Ngram = Log score
        # <s>, a    = -1
        # a, b      = -1
        # b, UNK    = -1
        # UNK, a    = -1.585
        # a, d      = -1
        # d, </s>   = -1
        # TOTAL logscores   = -6.585
        # - AVG logscores   = 1.0975
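        # perplexity is defined as 2 ** entropy, so 2 ** 1.0975 ≈ 2.1398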
        H = 1.0975
        perplexity = 2.1398
        assert pytest.approx(self.model.entropy(trained), 1e-4) == H
        assert pytest.approx(self.model.perplexity(trained), 1e-4) == perplexity

    def test_entropy_perplexity_unseen(self):
        # In MLE, even one unseen ngram should make entropy and perplexity infinite
        untrained = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]

        assert math.isinf(self.model.entropy(untrained))
        assert math.isinf(self.model.perplexity(untrained))

    def test_entropy_perplexity_unigrams(self):
        # word = score, log score
        # <s>   = 0.1429, -2.8074
        # a     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # UNK   = 0.2143, -2.2224
        # d     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # </s>  = 0.1429, -2.8074
        # TOTAL logscores = -21.6243
        # - AVG logscores = 3.0095
        H = 3.0095
        perplexity = 8.0529

        text = [("<s>",), ("a",), ("c",), ("-",), ("d",), ("c",), ("</s>",)]

        assert pytest.approx(self.model.entropy(text), 1e-4) == H
        assert pytest.approx(self.model.perplexity(text), 1e-4) == perplexity
Example #4
import multiprocessing
import sys

import bs4
import editdistance
import numpy as np
import pandas as pd
import requests
import tldextract
from gensim.models import Doc2Vec, Word2Vec
from gensim.parsing.preprocessing import (remove_stopwords, strip_numeric,
                                          strip_punctuation)
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from tqdm import tqdm


class FeatureEmbeddings:
    def __init__(self):
        self.features = pd.DataFrame()
        n = 2
        self.bigram = MLE(n)

    def __URLsplit(self, s):
        return [char for char in s]

    def __buildBigram(self, urls):
        train_data, padded_sents = padded_everygram_pipeline(2, urls)
        self.bigram.fit(train_data, padded_sents)

    def __cleanURL(self, url):
        xtract = tldextract.extract(url)
        return '.'.join(xtract)

    def __editDistance(self, url):
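        # Minimum character-level edit distance from `url` to a hand-picked list
        # of popular news sites (each cleaned with tldextract, to match the
        # cleaned URLs this method is called with).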
        popular_sites = [
            'https://news.yahoo.com/',
            'https://news.google.com/?hl=en-US&gl=US&ceid=US:en',
            'https://www.huffpost.com/', 'https://www.cnn.com/',
            'https://www.nytimes.com/', 'https://www.foxnews.com/',
            'https://www.nbcnews.com/',
            'https://www.dailymail.co.uk/ushome/index.html',
            'https://www.washingtonpost.com/',
            'https://www.theguardian.com/us', 'https://www.wsj.com/',
            'https://abcnews.go.com/', 'https://www.bbc.co.uk/news',
            'https://www.usatoday.com/', 'https://www.latimes.com/'
        ]
        popular_sites = [self.__cleanURL(str(x)) for x in popular_sites]
        dist = float('inf')
        for site in popular_sites:
            new_dist = editdistance.eval(url, site)
            if new_dist < dist:
                dist = new_dist
        return dist

    def __htmlInfo(self, urls):
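        # Request each URL: status stays -1 if the request fails, and the
        # HTML-derived features keep their defaults unless the page returns 200.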
        n = len(urls)
        status_codes = [-1] * n
        is_active = [0] * n
        has_wp_content = [-1] * n
        num_iframes = [-1] * n
        it = -1
        for url in tqdm(urls):
            it += 1
            try:
                response = requests.get(url, timeout=10)
                status_codes[it] = response.status_code

                if response.status_code == 200:
                    page = bs4.BeautifulSoup(response.text, 'lxml')
                    is_active[it] = 1
                    iframes = page.find_all(name='iframe')
                    num_iframes[it] = len(iframes)
                    has_wp_content[it] = 1 if response.text.find(
                        'wp-content') > -1 else 0
            except Exception:
                continue
        self.features['status'] = status_codes
        self.features['active'] = is_active
        self.features['wp_content'] = has_wp_content
        self.features['num_iframes'] = num_iframes

    def __cleanHeadline(self, h):
        return remove_stopwords(
            strip_punctuation(strip_numeric(str(h).lower()))).split(' ')

    def __get_val(self, v, row, i):
        if len(v[row]) == 0:
            return 0.0
        else:
            return float(v[row][i])

    def __headerEmbeddings(self, headers):
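        # Average pre-trained word vectors over each cleaned headline to get one
        # fixed-length headline embedding per row.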
        header_model = Word2Vec.load("models/headline_word_embeddings.model")
        head_vecs = []
        for h in headers:
            h = self.__cleanHeadline(h)
            h = [x for x in h if x in header_model.wv.vocab]
            if len(h) >= 1:
                head_vecs.append(np.mean(header_model.wv[h], axis=0))
            else:
                head_vecs.append([])
        for i in range(header_model.wv.vector_size):
            self.features.insert(i, 'h_vec_' + str(i), [
                self.__get_val(head_vecs, row, i)
                for row in range(len(head_vecs))
            ], True)

    def __articleEmbeddings(self, articles):
        # The nested `embed` helper is promoted to module scope via `global`
        # so that it can be handed to the multiprocessing pool below.
        global embed
        doc_model = Doc2Vec.load("models/my_doc2vec_model")
        a_vec_labels = ['a_vec_' + str(i) for i in range(100)]  # loaded model uses 100-dim vectors
        vecs = []

        # Infer a doc2vec vector for a single (index, article text) pair.
        def embed(article):
            t = str(article[1]).split()
            return list(doc_model.infer_vector(t))

        # Prep some basic parallelism, to make use of the supercomputer's many
        # cores. (An earlier per-article multiprocessing.Process approach was
        # replaced by the pool below.)
        pool = multiprocessing.Pool()
        for x in tqdm(pool.imap(embed, enumerate(articles)),
                      total=len(articles)):
            vecs.append(x)
        pool.close()
        pool.join()

        # Report the results: join the article embeddings onto the feature frame.
        a_embeds = pd.DataFrame(vecs, columns=a_vec_labels)
        self.features = a_embeds.join(self.features)

    def create(self, data, url_col, article_col, header_col=None):
        '''
        Creates the feature dataset from news article URLs, article text, and (optionally) headlines.
        Features:
          BUILT:
            TRANSFERRED:
              - bigram entropy
              - bigram perplexity
              - clean bigram entropy
              - clean bigram perplexity
              - edit distance to top 15 sites
              - status
              - active
              - has wordpress content
              - number of iframes
            NEW:
              - header embeddings
          TO BE BUILT:
            - article embeddings
            - url embeddings
        '''
        # HEADLINE VECTORS
        if header_col:
            sys.stdout.write('Building embeddings for headlines...\n')
            self.__headerEmbeddings(data[header_col])

        if url_col is not None:
            # BIGRAM ENTROPY & PERPLEXITY
            sys.stdout.write(
                'Building bigram model features for URL strings...\n')
            urls = data[url_col].apply(lambda a: str(a))
            split_urls = urls.apply(lambda a: self.__URLsplit(a))
            self.__buildBigram(split_urls)
            self.features['bigram_entropy'] = [
                self.bigram.entropy(x) for x in urls
            ]
            self.features['bigram_perplexity'] = [
                self.bigram.perplexity(x) for x in urls
            ]

            # CLEAN BIGRAM ENTROPY & PERPLEXITY
            clean_urls = urls.apply(lambda a: self.__cleanURL(str(a)))
            split_clean_urls = clean_urls.apply(lambda a: self.__URLsplit(a))
            self.__buildBigram(split_clean_urls)
            self.features['clean_bigram_entropy'] = [
                self.bigram.entropy(x) for x in split_clean_urls
            ]
            self.features['clean_bigram_perplexity'] = [
                self.bigram.perplexity(x) for x in split_clean_urls
            ]

            # EDIT DISTANCE
            sys.stdout.write(
                'Calculating edit distance for each URL string...\n')
            self.features['edit_distance'] = [
                self.__editDistance(x) for x in clean_urls
            ]

            # HTML INFO (STATUS, ACTIVE, WP CONTENT, # IFRAMES)
            #sys.stdout.write('Accessing request info for features...\n')
            #self.__htmlInfo(urls)

        # ARTICLE EMBEDDINGS VIA DOC2VEC
        sys.stdout.write('Inferring article embeddings via doc2vec...\n')
        self.__articleEmbeddings(data[article_col])
        sys.stdout.flush()
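
A minimal usage sketch (not part of the original code): the DataFrame and column names below are hypothetical, and the pre-trained Word2Vec/Doc2Vec model files referenced above must already exist on disk for the embedding steps to run.

# Hypothetical input data; a real run would load a labelled news dataset.
news = pd.DataFrame({
    'url': ['https://www.nytimes.com/some/article.html',
            'http://totally-real.news/shocking-story'],
    'headline': ['Senate passes budget bill', 'You will not believe this'],
    'text': ['The Senate voted on Tuesday to...', 'Doctors hate this one trick...'],
})

fe = FeatureEmbeddings()
fe.create(news, url_col='url', article_col='text', header_col='headline')
print(fe.features.head())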