示例#1
0
    def test_summary_sentences(self):
        # The event's summary sentences are compared against the sentences
        # the summarizer yields for the same title and text, each paired
        # with the source name and the article's external url.

        from argos.core.brain import summarizer
        title = 'Syria Misses New Deadline as It Works to Purge Arms'
        text = 'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
        expected_sents = summarizer.summarize(title, text)

        publisher = Source()
        publisher.name = 'Super Cool Times'

        story = Article(title=title, text=text, score=100)
        story.source = publisher
        story.ext_url = 'http://foo.com'

        self.event = Event([story])

        # One expected entry per summarizer sentence, in the same order.
        expected = []
        for sent in expected_sents:
            expected.append({
                'sentence': sent,
                'source': 'Super Cool Times',
                'url': 'http://foo.com',
            })

        self.assertEqual(self.event.summary_sentences, expected)
示例#2
0
    def test_event_no_matching_cluster_creates_new_cluster(self):
        # Clustering a single article is expected to leave exactly one
        # event in the database.
        lone_article = Article(
            title='Superstars',
            text='superstars are awesome, Clinton',
            created_at=datetime.utcnow(),
        )
        Event.cluster([lone_article])

        event_count = Event.query.count()
        self.assertEqual(event_count, 1)
示例#3
0
    def test_event_clusters_similar(self):
        # Replace the prepared cluster's members with two duplicate
        # articles, then cluster self.article; the cluster is expected
        # to end up with three members.
        self.prepare_event()
        self.cluster.members = self.prepare_articles(type='duplicate')

        Event.cluster([self.article])

        member_count = len(self.cluster.members)
        self.assertEqual(member_count, 3)
示例#4
0
 def test_event_does_not_cluster_not_similar(self):
     # After clustering the 'Superstars' article, the prepared cluster
     # is expected to still hold two members.
     self.prepare_event()
     outsider = Article(
         title='Superstars',
         text='superstars are awesome, Clinton',
         created_at=datetime.utcnow(),
     )
     Event.cluster([outsider])

     member_count = len(self.cluster.members)
     self.assertEqual(member_count, 2)
示例#5
0
    def test_event_clusters_similar(self):
        # Swap the prepared cluster's members for a pair of duplicates.
        self.prepare_event()
        duplicates = self.prepare_articles(type='duplicate')
        self.cluster.members = duplicates

        # Clustering self.article should grow the cluster to three members.
        Event.cluster([self.article])
        size_after = len(self.cluster.members)
        self.assertEqual(size_after, 3)
示例#6
0
 def test_event_entitize(self):
     # An event built from a 'Reagan' article and a prepared article is
     # expected to expose exactly the entity names Clinton and Reagan.
     reagan_article = Article(title='Robots', text='dinosaurs are cool, Reagan')
     prepared_article = self.prepare_articles()[0]
     self.cluster = Event([reagan_article, prepared_article])

     entity_names = set()
     for ent in self.cluster.entities:
         entity_names.add(ent.name)

     self.assertEqual(entity_names, {'Clinton', 'Reagan'})
示例#7
0
 def test_event_does_not_cluster_not_similar(self):
     # The prepared cluster should still have two members after the
     # 'Superstars' article has been clustered.
     self.prepare_event()
     article_fields = {
         'title': 'Superstars',
         'text': 'superstars are awesome, Clinton',
         'created_at': datetime.utcnow(),
     }
     Event.cluster([Article(**article_fields)])

     remaining = len(self.cluster.members)
     self.assertEqual(remaining, 2)
示例#8
0
    def test_event_no_matching_cluster_creates_new_cluster(self):
        # With a single article clustered, the Event query is expected
        # to count exactly one event.
        article_fields = {
            'title': 'Superstars',
            'text': 'superstars are awesome, Clinton',
            'created_at': datetime.utcnow(),
        }
        Event.cluster([Article(**article_fields)])

        total_events = Event.query.count()
        self.assertEqual(total_events, 1)
示例#9
0
    def test_event_does_not_cluster_if_no_shared_entities(self):
        # Give the prepared cluster a single 'Reagan' member, then cluster
        # self.article; the cluster is expected to keep just that member.
        self.prepare_event()
        reagan_article = Article(
            title='Robots',
            text='dinosaurs are cool, Reagan',
            created_at=datetime.utcnow(),
        )
        self.cluster.members = [reagan_article]

        Event.cluster([self.article])

        member_count = len(self.cluster.members)
        self.assertEqual(member_count, 1)
示例#10
0
    def test_event_does_not_cluster_if_no_shared_entities(self):
        self.prepare_event()

        # The prepared cluster's only member mentions Reagan.
        sole_member = Article(title='Robots',
                              text='dinosaurs are cool, Reagan',
                              created_at=datetime.utcnow())
        self.cluster.members = [sole_member]

        # Clustering self.article should leave the cluster at one member.
        Event.cluster([self.article])
        size_after = len(self.cluster.members)
        self.assertEqual(size_after, 1)
示例#11
0
 def test_event_titleize(self):
     # One 'Robots' article plus two duplicate prepared articles: the
     # event's title is expected to be 'Dinosaurs'.
     robots_article = Article(title='Robots', text='dinosaurs are cool, Reagan')
     duplicates = self.prepare_articles(type='duplicate')
     self.cluster = Event([robots_article] + duplicates)

     self.assertEqual(self.cluster.title, 'Dinosaurs')
示例#12
0
    def test_summary_sentences(self):
        # Verify the event's summary sentences line up with what the
        # summarizer produces for the same title and text.

        from argos.core.brain import summarizer
        source_name = 'Super Cool Times'
        article_url = 'http://foo.com'
        title = 'Syria Misses New Deadline as It Works to Purge Arms'
        text = 'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
        expected_sents = summarizer.summarize(title, text)

        source = Source()
        source.name = source_name

        article = Article(title=title, text=text, score=100)
        article.source = source
        article.ext_url = article_url

        self.event = Event([article])

        # Every expected entry shares the same source name and url.
        expected = [
            {'sentence': sent, 'source': source_name, 'url': article_url}
            for sent in expected_sents
        ]

        self.assertEqual(self.event.summary_sentences, expected)
示例#13
0
    def test_conceptize(self):
        # Build an event from a 'Reagan' article and a prepared article,
        # then check its concept slugs, mention names and concept scores.
        reagan_article = Article(title='Robots', text='dinosaurs are cool, Reagan')
        prepared_article = self.prepare_articles()[0]
        self.event = Event([reagan_article, prepared_article])

        concept_slugs = set()
        for con in self.event.concepts:
            concept_slugs.add(con.slug)

        mention_names = set()
        for ali in self.event.mentions:
            mention_names.add(ali.name)

        expected_names = {'Clinton', 'Reagan'}
        self.assertEqual(concept_slugs, expected_names)
        self.assertEqual(mention_names, expected_names)

        # Scores are not 0.5 each: they are weighed down
        # by the commonness of the concept.
        for concept in self.event.concepts:
            self.assertAlmostEqual(concept.score, 0.005, places=3)
示例#14
0
 def test_event_entitize(self):
     # The event's entity names are expected to be exactly
     # Clinton and Reagan.
     members = [
         Article(title='Robots', text='dinosaurs are cool, Reagan'),
         self.prepare_articles()[0],
     ]
     self.cluster = Event(members)

     found_names = set(ent.name for ent in self.cluster.entities)
     self.assertEqual(found_names, {'Clinton', 'Reagan'})
示例#15
0
    def test_event_similarity_with_cluster_different(self):
        # Similarity between the prepared cluster and an event built from
        # the 'different' article pair should be neither 1.0 nor 0.0.
        self.prepare_event()
        other_event = Event(self.prepare_articles(type='different'))

        avg_sim = self.cluster.similarity(other_event)

        for excluded in (1.0, 0.0):
            self.assertNotEqual(avg_sim, excluded)
示例#16
0
 def test_event_timespan(self):
     # Three articles dated Jan 20, 22 and 24 (2014); timespan() called
     # with Jan 21 is expected to return only 'B' and 'C'.
     text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
     dated_titles = (('A', 20), ('B', 22), ('C', 24))
     members = [
         Article(title=title,
                 text=text,
                 created_at=datetime(2014, 1, day, 1, 1, 1, 111111))
         for title, day in dated_titles
     ]
     self.cluster = Event(members)

     cutoff = datetime(2014, 1, 21, 1, 1, 1, 111111)
     results = self.cluster.timespan(cutoff)

     self.assertEqual(len(results), 2)
     returned_titles = {r.title for r in results}
     self.assertEqual(returned_titles, {'B', 'C'})
示例#17
0
 def test_timespan(self):
     # Articles A, B and C are dated Jan 20, 22 and 24 (2014); asking for
     # the timespan from Jan 21 is expected to yield B and C only.
     text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
     members = []
     for title, day in (('A', 20), ('B', 22), ('C', 24)):
         created = datetime(2014, 1, day, 1, 1, 1, 111111)
         members.append(Article(title=title, text=text, created_at=created))
     self.event = Event(members)

     start = datetime(2014, 1, 21, 1, 1, 1, 111111)
     results = self.event.timespan(start)

     self.assertEqual(len(results), 2)
     titles = set(r.title for r in results)
     self.assertEqual(titles, {'B', 'C'})
示例#18
0
    def test_event_deletion_removes_from_articles_events(self):
        # Deleting all events is expected to empty each article's
        # `events` list.
        articles = self.prepare_articles()
        for member in articles:
            self.db.session.add(member)

        # Persist an event over those articles.
        self.event = Event(articles)
        self.db.session.add(self.event)
        self.db.session.commit()

        # Before deletion: every article points at the event.
        for member in articles:
            self.assertEqual(member.events, [self.event])

        # Remove every event through the query interface.
        Event.query.delete()
        self.db.session.commit()

        # After deletion: no article references an event any more.
        for member in articles:
            self.assertEqual(member.events, [])
示例#19
0
    def test_story_clustering_without_matching_entities(self):
        # After clustering the 'Illiad' event, the factory story is expected
        # to keep two members and the story count is expected to be two.
        story = fac.story()

        # This event's entities are meant to differ completely
        # from the story's.
        illiad_article = fac.article(
            title='The Illiad',
            text='The Illiad has Argos in it.',
        )
        illiad_event = Event([illiad_article])

        Story.cluster([illiad_event])

        member_count = len(story.members)
        self.assertEqual(member_count, 2)
        story_count = Story.query.count()
        self.assertEqual(story_count, 2)
示例#20
0
    def test_event_similarity_with_cluster_duplicates(self):
        # Compare the prepared cluster against a second event built from
        # another set of standard articles.
        self.prepare_event()
        twin = Event(self.prepare_articles())
        avg_sim = self.cluster.similarity(twin)

        # Identical clusters do not currently score 1.0: the similarity
        # is the average similarity of the articles between the clusters,
        # not the overlap of the two clusters.
        expected_sim = 0.83999999999999
        self.assertAlmostEqual(avg_sim, expected_sim)
示例#21
0
    def _create_dated_story(self):
        """
        Build and commit a story made of three single-article events.

        The first and third events get a `created_at` of one day ago,
        the second one of five days ago.

        Returns:
            The tuple (story, datetime_A, datetime_B), where datetime_A is
            the one-day-ago timestamp and datetime_B the five-days-ago one.
        """
        datetime_A = datetime.utcnow() - timedelta(days=1)
        datetime_B = datetime.utcnow() - timedelta(days=5)

        # Events are created in order a, b, c with their respective dates.
        events = []
        for created_at in (datetime_A, datetime_B, datetime_A):
            article = fac.article(title='The Illiad', text='The Illiad has Argos in it.')
            event = Event([article])
            event.created_at = created_at
            events.append(event)

        story = Story(events)

        self.db.session.add(story)
        self.db.session.commit()

        return story, datetime_A, datetime_B
示例#22
0
    def test_conceptize(self):
        # Check the concept slugs, mention names and concept scores of an
        # event built from a 'Reagan' article and a prepared article.
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan'),
            self.prepare_articles()[0],
        ]
        self.event = Event(members)

        concepts = set(con.slug for con in self.event.concepts)
        mentions = set(ali.name for ali in self.event.mentions)

        for found in (concepts, mentions):
            self.assertEqual(found, {'Clinton', 'Reagan'})

        # A concept's score is weighed down by its commonness,
        # which is why it isn't 0.5.
        for concept in self.event.concepts:
            self.assertAlmostEqual(concept.score, 0.005, places=3)
示例#23
0
    def test_event_deletion_removes_from_articles_events(self):
        # Articles should drop their event references once the events
        # have been deleted.
        articles = self.prepare_articles()
        for art in articles:
            self.db.session.add(art)

        # Create and commit the event.
        self.event = Event(articles)
        self.db.session.add(self.event)
        self.db.session.commit()

        # Each article is expected to list exactly this event.
        for art in articles:
            self.assertEqual(art.events, [self.event])

        # Wipe all events.
        Event.query.delete()
        self.db.session.commit()

        # Each article's event list is expected to be empty now.
        for art in articles:
            self.assertEqual(art.events, [])
示例#24
0
 def prepare_event(self):
     """Create an event from the standard articles, store it on `self.event` and commit it."""
     standard_articles = self.prepare_articles()
     self.event = Event(standard_articles)

     self.db.session.add(self.event)
     self.db.session.commit()
示例#25
0
文件: __init__.py 项目: keho98/argos
def seed(debug=False):
    """
    Reset the database and fill it with sample data.

    Reads `seed.json` and `seed_sources.json` from this module's directory,
    creates a Source per source url and an Article per seed entry (each
    with a randomly picked sample image), then clusters the articles into
    events and the events into stories. Progress is printed along the way.

    Args:
        debug (bool): if True, each seed entry is printed as JSON
            before its article is created.
    """
    this_dir = os.path.dirname(__file__)

    # Read both seed files up front inside context managers, so the file
    # handles are always closed and a malformed file fails before the
    # database has been dropped.
    with open(os.path.join(this_dir, 'seed.json'), 'r') as seeds:
        entries = json.load(seeds)
    with open(os.path.join(this_dir, 'seed_sources.json'), 'r') as sources:
        source_urls = json.load(sources)

    # NOTE: every entry needs its trailing comma. Adjacent string literals
    # are concatenated, which previously fused two urls into one bad url.
    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg',
    ]

    print('Resetting the database...')
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    for url in source_urls:
        s = Source(ext_url=url, name='The Times') # fake name
        db.session.add(s)
    db.session.commit()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        # Look up the already-created source by the entry's source url.
        source = Source.query.filter_by(ext_url=entry['source']).first()

        a = Article(
            ext_url=entry['url'],
            source=source,
            html=entry['html'],
            text=entry['text'],
            tags=entry['tags'],
            title=entry['title'],
            created_at=parse(entry['published']),
            updated_at=parse(entry['updated']),
            image=random.choice(sample_images) # fake image
        )
        articles.append(a)
        db.session.add(a)

        # Report the percentage of entries processed so far.
        progress_bar(len(articles) / len(entries) * 100)

    db.session.commit()

    num_articles = Article.query.count()
    num_entities = Entity.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} entities.'.format(num_entities))

    print('Clustering articles into events...')
    Event.cluster(articles, threshold=0.02, debug=True)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    events = Event.query.all()
    Story.cluster(events, threshold=0.02, debug=True)
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} entities, created {3} events and {4} stories.'.format(num_sources, num_articles, num_entities, num_events, num_stories))
    print('==============================================\n\n')
示例#26
0
 def test_event_summarize(self):
     # An event built from the standard articles should have a truthy summary.
     standard_articles = self.prepare_articles()
     self.cluster = Event(standard_articles)

     summary = self.cluster.summary
     self.assertTrue(summary)
示例#27
0
 def prepare_event(self):
     """Create an event from the standard articles, store it on `self.cluster` and commit it."""
     standard_articles = self.prepare_articles()
     self.cluster = Event(standard_articles)

     self.db.session.add(self.cluster)
     self.db.session.commit()
示例#28
0
 def test_event_similarity_with_object_duplicates(self):
     # An event made of two duplicate articles is expected to have a
     # similarity of exactly 1.0 with self.article.
     duplicates = self.prepare_articles(type='duplicate')
     duplicate_event = Event(duplicates)

     avg_sim = duplicate_event.similarity(self.article)

     self.assertEqual(avg_sim, 1.0)
示例#29
0
 def prepare_event(self):
     """Build `self.cluster` from the standard articles and persist it."""
     self.cluster = Event(self.prepare_articles())

     # Add the new event to the session, then commit.
     self.db.session.add(self.cluster)
     self.db.session.commit()
示例#30
0
 def test_event_summarize_single_article(self):
     # An event holding only one article should still have a truthy summary.
     only_article = self.prepare_articles()[0]
     self.cluster = Event([only_article])

     summary = self.cluster.summary
     self.assertTrue(summary)
示例#31
0
 def test_summarize_single_article(self):
     # An event holding only one article should still have a truthy summary.
     only_article = self.prepare_articles()[0]
     self.event = Event([only_article])

     summary = self.event.summary
     self.assertTrue(summary)
示例#32
0
 def test_summarize(self):
     # An event built from the standard articles should have a truthy summary.
     standard_articles = self.prepare_articles()
     self.event = Event(standard_articles)

     summary = self.event.summary
     self.assertTrue(summary)
示例#33
0
 def test_conceptize_no_duplicates(self):
     # Lists (not sets) are compared here, so a repeated concept or
     # mention would make the assertion fail.
     self.event = Event(self.prepare_articles())

     concept_slugs = []
     for con in self.event.concepts:
         concept_slugs.append(con.slug)

     mention_names = []
     for ali in self.event.mentions:
         mention_names.append(ali.name)

     self.assertEqual(concept_slugs, ['Clinton'])
     self.assertEqual(mention_names, ['Clinton'])
示例#34
0
class EventTest(RequiresDatabase):
    """
    Note this tests the abstract Cluster class's methods as well.
    A Cluster instance can't be instantiated since it is abstract,
    so we use the Event as a testing proxy.
    """
    patch_knowledge = True

    def setUp(self):
        """Store the first standard article on the test case as `self.article`."""
        self.article = self.prepare_articles()[0]

    def prepare_articles(self, type='standard', score=100):
        """
        Create, persist and return a pair of articles.

        Args:
            type (str): which pair to build -- 'standard' (articles a and b),
                'duplicate' (article a twice) or 'different' (articles a and c).
            score: the score given to every created article.

        Returns:
            A list with the two newly committed articles.

        Raises:
            ValueError: if `type` is not one of the supported names.
        """
        a = {
            'title': 'Dinosaurs',
            'text': 'dinosaurs are cool, Clinton',
            'score': score
        }
        b = {
            'title': 'Robots',
            'text': 'robots are nice, Clinton',
            'score': score
        }
        c = {
            'title': 'Robots',
            'text': 'papa was a rodeo, Clinton',
            'score': score
        }

        pairs = {
            'standard': (a, b),
            'duplicate': (a, a),
            'different': (a, c),
        }
        # Fail loudly on an unknown type instead of hitting an
        # unbound `articles` variable further down.
        if type not in pairs:
            raise ValueError('Unknown article type: {0!r}'.format(type))
        articles = [Article(**fields) for fields in pairs[type]]

        # Need to save these articles to persist concepts,
        # so that their overlaps are calculated properly when clustering!
        for article in articles:
            self.db.session.add(article)
        self.db.session.commit()

        return articles

    def prepare_event(self):
        """Create an event from the standard articles, store it on `self.event` and commit it."""
        self.event = Event(self.prepare_articles())
        self.db.session.add(self.event)
        self.db.session.commit()

    def test_event_deletion_removes_from_articles_events(self):
        articles = self.prepare_articles()
        for article in articles:
            self.db.session.add(article)

        # Make an event.
        self.event = Event(articles)
        self.db.session.add(self.event)
        self.db.session.commit()

        # The articles should reference their events.
        for article in articles:
            self.assertEqual(article.events, [self.event])

        # Destroy events.
        Event.query.delete()
        self.db.session.commit()

        # The articles should no longer have references to the events.
        for article in articles:
            self.assertEqual(article.events, [])

    def test_conceptize(self):
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan'),
            self.prepare_articles()[0]
        ]
        self.event = Event(members)

        concepts = {con.slug for con in self.event.concepts}
        mentions = {ali.name for ali in self.event.mentions}

        self.assertEqual(concepts, {'Clinton', 'Reagan'})
        self.assertEqual(mentions, {'Clinton', 'Reagan'})

        # Each concept's score won't be 0.5, since
        # they are weighed down by the commonness.
        for concept in self.event.concepts:
            self.assertAlmostEqual(concept.score, 0.005, places=3)

    def test_conceptize_no_duplicates(self):
        # Lists (not sets) are compared, so repeated entries would fail.
        self.event = Event(self.prepare_articles())
        concepts = [con.slug for con in self.event.concepts]
        mentions = [ali.name for ali in self.event.mentions]
        self.assertEqual(concepts, ['Clinton'])
        self.assertEqual(mentions, ['Clinton'])

    def test_summarize(self):
        # The summary only needs to be truthy here.
        self.event = Event(self.prepare_articles())
        self.assertTrue(self.event.summary)

    def test_summarize_single_article(self):
        # Same check, for an event with just one article.
        self.event = Event([self.prepare_articles()[0]])
        self.assertTrue(self.event.summary)

    def test_summary_sentences(self):
        # Check to see that we can break up the summary
        # back into its original sentences.

        from argos.core.brain import summarizer
        title = 'Syria Misses New Deadline as It Works to Purge Arms'
        text = 'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
        expected_sents = summarizer.summarize(title, text)

        source = Source()
        source.name = 'Super Cool Times'

        article = Article(title=title, text=text, score=100)
        article.source = source
        article.ext_url = 'http://foo.com'

        self.event = Event([article])

        expected = [{
            'sentence': sent,
            'source': 'Super Cool Times',
            'url': 'http://foo.com'
        } for sent in expected_sents]

        self.assertEqual(self.event.summary_sentences, expected)

    def test_timespan(self):
        # Articles dated Jan 20, 22 and 24; a timespan from Jan 21
        # is expected to return only B and C.
        text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
        members = [
            Article(title='A',
                    text=text,
                    created_at=datetime(2014, 1, 20, 1, 1, 1, 111111)),
            Article(title='B',
                    text=text,
                    created_at=datetime(2014, 1, 22, 1, 1, 1, 111111)),
            Article(title='C',
                    text=text,
                    created_at=datetime(2014, 1, 24, 1, 1, 1, 111111))
        ]
        self.event = Event(members)
        results = self.event.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
        self.assertEqual(len(results), 2)
        self.assertEqual({r.title for r in results}, {'B', 'C'})

    def test_score_prefer_newer_events(self):
        # event_b is created after event_a from otherwise equal articles.
        event_a = Event(self.prepare_articles())
        event_b = Event(self.prepare_articles())

        self.assertGreater(event_b.score, event_a.score)

    def test_score_prefer_events_with_higher_article_scores(self):
        # event_b's articles are scored 200 instead of the default 100.
        event_a = Event(self.prepare_articles())
        event_b = Event(self.prepare_articles(score=200))

        self.assertGreater(event_b.score, event_a.score)
示例#35
0
class EventTest(RequiresDatabase):
    """
    Note this tests the abstract Cluster class's methods as well.
    A Cluster instance can't be instantiated since it is abstract,
    so we use the Event as a testing proxy.
    """
    patch_knowledge = True

    def setUp(self):
        """Store the first standard article on the test case as `self.article`."""
        self.article = self.prepare_articles()[0]

    def prepare_articles(self, type='standard', score=100):
        """
        Create, persist and return a pair of articles.

        Args:
            type (str): which pair to build -- 'standard' (articles a and b),
                'duplicate' (article a twice) or 'different' (articles a and c).
            score: the score given to every created article.

        Returns:
            A list with the two newly committed articles.

        Raises:
            ValueError: if `type` is not one of the supported names.
        """
        a = {'title': 'Dinosaurs', 'text': 'dinosaurs are cool, Clinton', 'score': score}
        b = {'title': 'Robots', 'text': 'robots are nice, Clinton', 'score': score}
        c = {'title': 'Robots', 'text': 'papa was a rodeo, Clinton', 'score': score}

        if type == 'standard':
            articles = [Article(**a), Article(**b)]
        elif type == 'duplicate':
            articles = [Article(**a), Article(**a)]
        elif type == 'different':
            articles = [Article(**a), Article(**c)]
        else:
            # Without this branch `articles` would be unbound below and
            # the caller would get a confusing UnboundLocalError.
            raise ValueError('Unknown article type: {0!r}'.format(type))

        # Need to save these articles to persist concepts,
        # so that their overlaps are calculated properly when clustering!
        for article in articles:
            self.db.session.add(article)
        self.db.session.commit()

        return articles

    def prepare_event(self):
        """Create an event from the standard articles, store it on `self.event` and commit it."""
        self.event = Event(self.prepare_articles())
        self.db.session.add(self.event)
        self.db.session.commit()

    def test_event_deletion_removes_from_articles_events(self):
        articles = self.prepare_articles()
        for article in articles:
            self.db.session.add(article)

        # Make an event.
        self.event = Event(articles)
        self.db.session.add(self.event)
        self.db.session.commit()

        # The articles should reference their events.
        for article in articles:
            self.assertEqual(article.events, [self.event])

        # Destroy events.
        Event.query.delete()
        self.db.session.commit()

        # The articles should no longer have references to the events.
        for article in articles:
            self.assertEqual(article.events, [])

    def test_conceptize(self):
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan'),
            self.prepare_articles()[0],
        ]
        self.event = Event(members)

        concepts = {con.slug for con in self.event.concepts}
        mentions = {ali.name for ali in self.event.mentions}

        self.assertEqual(concepts, {'Clinton', 'Reagan'})
        self.assertEqual(mentions, {'Clinton', 'Reagan'})

        # Each concept's score won't be 0.5, since
        # they are weighed down by the commonness.
        for concept in self.event.concepts:
            self.assertAlmostEqual(concept.score, 0.005, places=3)

    def test_conceptize_no_duplicates(self):
        # Lists (not sets) are compared, so repeated entries would fail.
        self.event = Event(self.prepare_articles())
        concepts = [con.slug for con in self.event.concepts]
        mentions = [ali.name for ali in self.event.mentions]
        self.assertEqual(concepts, ['Clinton'])
        self.assertEqual(mentions, ['Clinton'])

    def test_summarize(self):
        # The summary only needs to be truthy here.
        self.event = Event(self.prepare_articles())
        self.assertTrue(self.event.summary)

    def test_summarize_single_article(self):
        # Same check, for an event with just one article.
        self.event = Event([self.prepare_articles()[0]])
        self.assertTrue(self.event.summary)

    def test_summary_sentences(self):
        # Check to see that we can break up the summary
        # back into its original sentences.

        from argos.core.brain import summarizer
        title = 'Syria Misses New Deadline as It Works to Purge Arms'
        text = 'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
        expected_sents = summarizer.summarize(title, text)

        source = Source()
        source.name = 'Super Cool Times'

        article = Article(title=title, text=text, score=100)
        article.source = source
        article.ext_url = 'http://foo.com'

        self.event = Event([article])

        expected = [{
            'sentence': sent,
            'source': 'Super Cool Times',
            'url': 'http://foo.com'
        } for sent in expected_sents]

        self.assertEqual(self.event.summary_sentences, expected)

    def test_timespan(self):
        # Articles dated Jan 20, 22 and 24; a timespan from Jan 21
        # is expected to return only B and C.
        text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
        members = [
            Article(title='A', text=text, created_at=datetime(2014, 1, 20, 1, 1, 1, 111111)),
            Article(title='B', text=text, created_at=datetime(2014, 1, 22, 1, 1, 1, 111111)),
            Article(title='C', text=text, created_at=datetime(2014, 1, 24, 1, 1, 1, 111111)),
        ]
        self.event = Event(members)
        results = self.event.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
        self.assertEqual(len(results), 2)
        self.assertEqual({r.title for r in results}, {'B', 'C'})

    def test_score_prefer_newer_events(self):
        # event_b is created after event_a from otherwise equal articles.
        event_a = Event(self.prepare_articles())
        event_b = Event(self.prepare_articles())

        self.assertGreater(event_b.score, event_a.score)

    def test_score_prefer_events_with_higher_article_scores(self):
        # event_b's articles are scored 200 instead of the default 100.
        event_a = Event(self.prepare_articles())
        event_b = Event(self.prepare_articles(score=200))

        self.assertGreater(event_b.score, event_a.score)
示例#36
0
 def test_event_titleize(self):
     # With one 'Robots' article and two duplicate prepared articles,
     # the event's title is expected to be 'Dinosaurs'.
     members = [Article(title='Robots', text='dinosaurs are cool, Reagan')]
     members = members + self.prepare_articles(type='duplicate')
     self.cluster = Event(members)

     chosen_title = self.cluster.title
     self.assertEqual(chosen_title, 'Dinosaurs')
示例#37
0
 def test_event_summarize(self):
     # The summary of an event over the standard articles must be truthy.
     members = self.prepare_articles()
     self.cluster = Event(members)
     self.assertTrue(self.cluster.summary)
示例#38
0
 def prepare_event(self):
     """Build `self.event` from the standard articles and persist it."""
     self.event = Event(self.prepare_articles())

     # Add the new event to the session, then commit.
     self.db.session.add(self.event)
     self.db.session.commit()
示例#39
0
class EventTest(RequiresApp):
    """
    Note this tests the abstract Cluster class's methods as well.
    A Cluster instance can't be instantiated since it is abstract,
    so we use the Event as a testing proxy.
    """
    def setUp(self):
        # A saved 'Dinosaurs' fixture article which the tests compare
        # against events or feed into clustering.
        self.article = self.prepare_articles()[0]

    def prepare_articles(self, type='standard'):
        """
        Create, save and return a pair of fixture articles.

        `type` selects which pair is built:
            'standard'  -- a 'Dinosaurs' article and a 'Robots' article
            'duplicate' -- two identical 'Dinosaurs' articles
            'different' -- the 'Dinosaurs' article and a 'Robots' article
                           with unrelated text

        Raises a ValueError for any other `type`.
        """
        a = {'title': 'Dinosaurs', 'text': 'dinosaurs are cool, Clinton'}
        b = {'title': 'Robots', 'text': 'robots are nice, Clinton'}
        c = {'title': 'Robots', 'text': 'papa was a rodeo, Clinton'}

        if type == 'standard':
            articles = [Article(**a), Article(**b)]
        elif type == 'duplicate':
            articles = [Article(**a), Article(**a)]
        elif type == 'different':
            articles = [Article(**a), Article(**c)]
        else:
            # Without this branch a typo in `type` would only surface as an
            # UnboundLocalError on `articles` further down.
            raise ValueError('Unknown article fixture type: {!r}'.format(type))

        # Need to save these articles to persist entities,
        # so that their overlaps are calculated properly when clustering!
        for article in articles:
            self.db.session.add(article)
        self.db.session.commit()

        return articles

    def prepare_event(self):
        """Build `self.cluster` from the 'standard' fixture articles and commit it."""
        self.cluster = Event(self.prepare_articles())
        self.db.session.add(self.cluster)
        self.db.session.commit()

    def test_event_similarity_with_object_different(self):
        # The event holds a 'Dinosaurs' and a 'Robots' article, so its
        # similarity to the 'Dinosaurs' probe article is expected to be
        # strictly between the two extremes.
        self.prepare_event()
        avg_sim = self.cluster.similarity(self.article)
        self.assertNotEqual(avg_sim, 1.0)
        self.assertNotEqual(avg_sim, 0.0)

    def test_event_similarity_with_object_duplicates(self):
        # Every member has the same title and text as the probe article.
        members = self.prepare_articles(type='duplicate')
        c = Event(members)
        avg_sim = c.similarity(self.article)
        self.assertEqual(avg_sim, 1.0)

    def test_event_similarity_with_cluster_duplicates(self):
        self.prepare_event()
        members = self.prepare_articles()
        c = Event(members)
        avg_sim = self.cluster.similarity(c)

        # Currently, the similarity calculation between clusters
        # does not yield 1.0 if they are identical clusters,
        # because we calculate the average similarity of the articles
        # between the clusters, rather than the overlap of the two clusters.
        self.assertAlmostEqual(avg_sim, 0.83999999999999)

    def test_event_similarity_with_cluster_different(self):
        # Compare the 'standard' event against one built from the
        # 'different' fixture pair.
        self.prepare_event()
        members = self.prepare_articles(type='different')
        c = Event(members)

        avg_sim = self.cluster.similarity(c)
        self.assertNotEqual(avg_sim, 1.0)
        self.assertNotEqual(avg_sim, 0.0)

    def test_event_expired_made_inactive(self):
        # Backdate the event by four days, then run clustering; the event
        # is expected to be flagged inactive afterwards.
        self.prepare_event()
        self.cluster.updated_at = datetime.utcnow() - timedelta(days=4)
        Event.cluster([self.article])
        self.assertFalse(self.cluster.active)

    def test_event_clusters_similar(self):
        # Swap the event's members for two identical 'Dinosaurs' articles;
        # clustering the matching probe article is expected to add it as
        # a third member.
        self.prepare_event()
        members = self.prepare_articles(type='duplicate')
        self.cluster.members = members

        Event.cluster([self.article])
        self.assertEqual(len(self.cluster.members), 3)

    def test_event_does_not_cluster_if_no_shared_entities(self):
        # The event's only member mentions Reagan while the probe article
        # mentions Clinton, so the member count is expected to stay at one.
        self.prepare_event()
        members = [Article(
            title='Robots',
            text='dinosaurs are cool, Reagan',
            created_at=datetime.utcnow()
        )]
        self.cluster.members = members

        Event.cluster([self.article])
        self.assertEqual(len(self.cluster.members), 1)

    def test_event_does_not_cluster_not_similar(self):
        # The new article mentions Clinton like the event's members do, but
        # its title and text differ; the event is expected to keep its
        # original two members.
        self.prepare_event()
        article = Article(
                title='Superstars',
                text='superstars are awesome, Clinton',
                created_at=datetime.utcnow()
        )
        Event.cluster([article])
        self.assertEqual(len(self.cluster.members), 2)

    def test_event_no_matching_cluster_creates_new_cluster(self):
        # No event has been prepared here, so clustering a single article
        # is expected to leave exactly one event in the database.
        article = Article(
                title='Superstars',
                text='superstars are awesome, Clinton',
                created_at=datetime.utcnow()
        )
        Event.cluster([article])

        self.assertEqual(Event.query.count(), 1)

    def test_event_entitize(self):
        # One member mentions Reagan, the other Clinton; the event is
        # expected to expose both entity names.
        members = [Article(
            title='Robots',
            text='dinosaurs are cool, Reagan'
        ), self.prepare_articles()[0]]
        self.cluster = Event(members)
        entities = {ent.name for ent in self.cluster.entities}
        self.assertEqual(entities, {'Clinton', 'Reagan'})

    def test_event_entitize_no_duplicates(self):
        # Both 'standard' articles mention Clinton; the name is expected
        # to appear only once among the event's entities.
        self.cluster = Event(self.prepare_articles())
        entities = [ent.name for ent in self.cluster.entities]
        self.assertEqual(entities, ['Clinton'])

    def test_event_titleize(self):
        # One 'Robots' article plus two 'Dinosaurs' duplicates; the event
        # title is expected to be 'Dinosaurs'.
        members = [Article(
            title='Robots',
            text='dinosaurs are cool, Reagan'
        )] + self.prepare_articles(type='duplicate')
        self.cluster = Event(members)
        self.assertEqual(self.cluster.title, 'Dinosaurs')

    def test_event_summarize(self):
        # An event built from two articles must have a truthy summary.
        self.cluster = Event(self.prepare_articles())
        self.assertTrue(self.cluster.summary)

    def test_event_summarize_single_article(self):
        # Same check for an event with a single member.
        self.cluster = Event([self.prepare_articles()[0]])
        self.assertTrue(self.cluster.summary)

    def test_event_timespan(self):
        # Articles A, B and C are created two days apart; the cutoff passed
        # to timespan() falls between A and B, so only B and C are expected.
        text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
        members = [
                Article(title='A', text=text, created_at=datetime(2014, 1, 20, 1, 1, 1, 111111)),
                Article(title='B', text=text, created_at=datetime(2014, 1, 22, 1, 1, 1, 111111)),
                Article(title='C', text=text, created_at=datetime(2014, 1, 24, 1, 1, 1, 111111))
        ]
        self.cluster = Event(members)
        results = self.cluster.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
        self.assertEqual(len(results), 2)
        self.assertEqual({r.title for r in results}, {'B', 'C'})
示例#40
0
def process_events(h, clusters, expiry_days=3):
    """
    Takes clusters of node uuids and
    builds, modifies, and deletes events out of them.

    Args:
        h: the hierarchy object. It is passed to `representative_article`,
            and its `to_iid` and `prune` methods are used to remove the
            articles of expired events.
        clusters: the clusters of node ids; handed to `triage` together
            with the member node ids of the currently active events.
        expiry_days: an unchanged event whose `updated_at` lies more than
            this many whole days in the past is deactivated. Defaults to 3.

    All changes are committed to the db session at the end.
    """
    now = datetime.utcnow()

    # Get existing event clusters.
    event_map = {}
    existing = {}
    for e in Event.all_active():
        # Map event ids to their event, for lookup later.
        event_map[e.id] = e

        # Map event ids to a list of their member node ids.
        existing[e.id] = [a.node_id for a in e.articles]

    # Figure out which events to update, delete, and create.
    to_update, to_create, to_delete, unchanged = triage(existing, clusters)

    for a_ids in to_create:
        # `.item()` presumably unwraps numpy scalars into plain Python
        # values for the query -- TODO confirm against the caller.
        node_ids = [node_id.item() for node_id in a_ids]
        articles = (Article.query
                    .filter(Article.node_id.in_(node_ids))
                    .order_by(Article.created_at.desc())
                    .all())
        e = Event(articles)

        # NOTE(review): the articles are sorted newest first, so this takes
        # `created_at` from the newest article and `updated_at` from the
        # oldest one. That looks inverted -- confirm it is intended.
        e.created_at = articles[0].created_at
        e.updated_at = articles[-1].updated_at

        rep_article = representative_article(h, a_ids, articles)
        e.title = rep_article.title
        e.image = rep_article.image

        db.session.add(e)

    for e_id, a_ids in to_update.items():
        e = event_map[e_id]
        node_ids = [node_id.item() for node_id in a_ids]
        articles = Article.query.filter(Article.node_id.in_(node_ids)).all()
        e.members = articles

        rep_article = representative_article(h, a_ids, articles)
        e.title = rep_article.title
        e.image = rep_article.image

        e.update()

    # Freeze expiring events and clean up their articles from the hierarchy.
    for e_id in unchanged:
        e = event_map[e_id]
        # `timedelta.days` counts whole days only, so with the default an
        # event expires once it is at least four full days old.
        if (now - e.updated_at).days > expiry_days:
            e.active = False
            nodes = [h.to_iid(a.node_id) for a in e.articles]
            h.prune(nodes)

    # Do this LAST so any of this event's associated articles
    # have a chance to be moved to their new clusters (if any).
    for e_id in to_delete:
        db.session.delete(event_map[e_id])
        # does this need to prune the articles as well?
        # i think the assumption is that a deleted event's articles have all migrated elsewhere.

    db.session.commit()
示例#41
0
 def test_event_similarity_with_object_duplicates(self):
     # An event made of the 'duplicate' fixture articles is expected to be
     # exactly 1.0 similar to the probe article.
     duplicates = self.prepare_articles(type='duplicate')
     event = Event(duplicates)
     similarity = event.similarity(self.article)
     self.assertEqual(similarity, 1.0)
示例#42
0
    def test_score_prefer_events_with_higher_article_scores(self):
        # Articles prepared with score=200 are expected to lift their event's
        # score above that of an event built from default articles.
        default_event = Event(self.prepare_articles())
        high_score_event = Event(self.prepare_articles(score=200))

        self.assertGreater(high_score_event.score, default_event.score)
示例#43
0
class EventTest(RequiresApp):
    """
    Note this tests the abstract Cluster class's methods as well.
    A Cluster instance can't be instantiated since it is abstract,
    so we use the Event as a testing proxy.
    """
    def setUp(self):
        # A saved 'Dinosaurs' fixture article which the tests compare
        # against events or feed into clustering.
        self.article = self.prepare_articles()[0]

    def prepare_articles(self, type='standard'):
        """
        Create, save and return a pair of fixture articles.

        `type` selects which pair is built:
            'standard'  -- a 'Dinosaurs' article and a 'Robots' article
            'duplicate' -- two identical 'Dinosaurs' articles
            'different' -- the 'Dinosaurs' article and a 'Robots' article
                           with unrelated text

        Raises a ValueError for any other `type`.
        """
        a = {'title': 'Dinosaurs', 'text': 'dinosaurs are cool, Clinton'}
        b = {'title': 'Robots', 'text': 'robots are nice, Clinton'}
        c = {'title': 'Robots', 'text': 'papa was a rodeo, Clinton'}

        if type == 'standard':
            articles = [Article(**a), Article(**b)]
        elif type == 'duplicate':
            articles = [Article(**a), Article(**a)]
        elif type == 'different':
            articles = [Article(**a), Article(**c)]
        else:
            # Without this branch a typo in `type` would only surface as an
            # UnboundLocalError on `articles` further down.
            raise ValueError('Unknown article fixture type: {!r}'.format(type))

        # Need to save these articles to persist entities,
        # so that their overlaps are calculated properly when clustering!
        for article in articles:
            self.db.session.add(article)
        self.db.session.commit()

        return articles

    def prepare_event(self):
        """Build `self.cluster` from the 'standard' fixture articles and commit it."""
        self.cluster = Event(self.prepare_articles())
        self.db.session.add(self.cluster)
        self.db.session.commit()

    def test_event_similarity_with_object_different(self):
        # The event holds a 'Dinosaurs' and a 'Robots' article, so its
        # similarity to the 'Dinosaurs' probe article is expected to be
        # strictly between the two extremes.
        self.prepare_event()
        avg_sim = self.cluster.similarity(self.article)
        self.assertNotEqual(avg_sim, 1.0)
        self.assertNotEqual(avg_sim, 0.0)

    def test_event_similarity_with_object_duplicates(self):
        # Every member has the same title and text as the probe article.
        members = self.prepare_articles(type='duplicate')
        c = Event(members)
        avg_sim = c.similarity(self.article)
        self.assertEqual(avg_sim, 1.0)

    def test_event_similarity_with_cluster_duplicates(self):
        self.prepare_event()
        members = self.prepare_articles()
        c = Event(members)
        avg_sim = self.cluster.similarity(c)

        # Currently, the similarity calculation between clusters
        # does not yield 1.0 if they are identical clusters,
        # because we calculate the average similarity of the articles
        # between the clusters, rather than the overlap of the two clusters.
        self.assertAlmostEqual(avg_sim, 0.83999999999999)

    def test_event_similarity_with_cluster_different(self):
        # Compare the 'standard' event against one built from the
        # 'different' fixture pair.
        self.prepare_event()
        members = self.prepare_articles(type='different')
        c = Event(members)

        avg_sim = self.cluster.similarity(c)
        self.assertNotEqual(avg_sim, 1.0)
        self.assertNotEqual(avg_sim, 0.0)

    def test_event_expired_made_inactive(self):
        # Backdate the event by four days, then run clustering; the event
        # is expected to be flagged inactive afterwards.
        self.prepare_event()
        self.cluster.updated_at = datetime.utcnow() - timedelta(days=4)
        Event.cluster([self.article])
        self.assertFalse(self.cluster.active)

    def test_event_clusters_similar(self):
        # Swap the event's members for two identical 'Dinosaurs' articles;
        # clustering the matching probe article is expected to add it as
        # a third member.
        self.prepare_event()
        members = self.prepare_articles(type='duplicate')
        self.cluster.members = members

        Event.cluster([self.article])
        self.assertEqual(len(self.cluster.members), 3)

    def test_event_does_not_cluster_if_no_shared_entities(self):
        # The event's only member mentions Reagan while the probe article
        # mentions Clinton, so the member count is expected to stay at one.
        self.prepare_event()
        members = [
            Article(title='Robots',
                    text='dinosaurs are cool, Reagan',
                    created_at=datetime.utcnow())
        ]
        self.cluster.members = members

        Event.cluster([self.article])
        self.assertEqual(len(self.cluster.members), 1)

    def test_event_does_not_cluster_not_similar(self):
        # The new article mentions Clinton like the event's members do, but
        # its title and text differ; the event is expected to keep its
        # original two members.
        self.prepare_event()
        article = Article(title='Superstars',
                          text='superstars are awesome, Clinton',
                          created_at=datetime.utcnow())
        Event.cluster([article])
        self.assertEqual(len(self.cluster.members), 2)

    def test_event_no_matching_cluster_creates_new_cluster(self):
        # No event has been prepared here, so clustering a single article
        # is expected to leave exactly one event in the database.
        article = Article(title='Superstars',
                          text='superstars are awesome, Clinton',
                          created_at=datetime.utcnow())
        Event.cluster([article])

        self.assertEqual(Event.query.count(), 1)

    def test_event_entitize(self):
        # One member mentions Reagan, the other Clinton; the event is
        # expected to expose both entity names.
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan'),
            self.prepare_articles()[0]
        ]
        self.cluster = Event(members)
        entities = {ent.name for ent in self.cluster.entities}
        self.assertEqual(entities, {'Clinton', 'Reagan'})

    def test_event_entitize_no_duplicates(self):
        # Both 'standard' articles mention Clinton; the name is expected
        # to appear only once among the event's entities.
        self.cluster = Event(self.prepare_articles())
        entities = [ent.name for ent in self.cluster.entities]
        self.assertEqual(entities, ['Clinton'])

    def test_event_titleize(self):
        # One 'Robots' article plus two 'Dinosaurs' duplicates; the event
        # title is expected to be 'Dinosaurs'.
        members = [Article(title='Robots', text='dinosaurs are cool, Reagan')
                   ] + self.prepare_articles(type='duplicate')
        self.cluster = Event(members)
        self.assertEqual(self.cluster.title, 'Dinosaurs')

    def test_event_summarize(self):
        # An event built from two articles must have a truthy summary.
        self.cluster = Event(self.prepare_articles())
        self.assertTrue(self.cluster.summary)

    def test_event_summarize_single_article(self):
        # Same check for an event with a single member.
        self.cluster = Event([self.prepare_articles()[0]])
        self.assertTrue(self.cluster.summary)

    def test_event_timespan(self):
        # Articles A, B and C are created two days apart; the cutoff passed
        # to timespan() falls between A and B, so only B and C are expected.
        text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
        members = [
            Article(title='A',
                    text=text,
                    created_at=datetime(2014, 1, 20, 1, 1, 1, 111111)),
            Article(title='B',
                    text=text,
                    created_at=datetime(2014, 1, 22, 1, 1, 1, 111111)),
            Article(title='C',
                    text=text,
                    created_at=datetime(2014, 1, 24, 1, 1, 1, 111111))
        ]
        self.cluster = Event(members)
        results = self.cluster.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
        self.assertEqual(len(results), 2)
        self.assertEqual({r.title for r in results}, {'B', 'C'})
示例#44
0
    def test_score_prefer_newer_events(self):
        # Same fixture articles for both events; only the order in which the
        # events are built differs, and the later one is expected to win.
        earlier = Event(self.prepare_articles())
        later = Event(self.prepare_articles())

        self.assertGreater(later.score, earlier.score)
示例#45
0
 def test_event_expired_made_inactive(self):
     # Backdate the prepared event by four days, run clustering, and expect
     # the event to be flagged inactive.
     self.prepare_event()
     four_days_ago = datetime.utcnow() - timedelta(days=4)
     self.cluster.updated_at = four_days_ago
     incoming = [self.article]
     Event.cluster(incoming)
     self.assertFalse(self.cluster.active)
示例#46
0
 def test_summarize(self):
     # An event built from the default fixture articles must have a truthy summary.
     fixture_articles = self.prepare_articles()
     self.event = Event(fixture_articles)
     summary = self.event.summary
     self.assertTrue(summary)
示例#47
0
 def test_event_entitize_no_duplicates(self):
     # The event's entity names are expected to contain 'Clinton' exactly once.
     articles = self.prepare_articles()
     self.cluster = Event(articles)
     names = []
     for entity in self.cluster.entities:
         names.append(entity.name)
     self.assertEqual(names, ['Clinton'])
示例#48
0
 def test_event_expired_made_inactive(self):
     # An event last updated four days ago is expected to be inactive once
     # clustering has run.
     self.prepare_event()
     stale_timestamp = datetime.utcnow() - timedelta(days=4)
     self.cluster.updated_at = stale_timestamp
     Event.cluster([self.article])
     is_active = self.cluster.active
     self.assertFalse(is_active)
示例#49
0
 def test_event_entitize_no_duplicates(self):
     # Collect the event's entity names in order; 'Clinton' is expected to
     # be listed a single time.
     self.cluster = Event(self.prepare_articles())
     found_entities = self.cluster.entities
     entity_names = [entity.name for entity in found_entities]
     self.assertEqual(entity_names, ['Clinton'])
示例#50
0
 def test_conceptize_no_duplicates(self):
     # Both the concept slugs and the mention names of the event are
     # expected to contain 'Clinton' exactly once.
     articles = self.prepare_articles()
     self.event = Event(articles)

     slugs = []
     for concept in self.event.concepts:
         slugs.append(concept.slug)

     names = []
     for mention in self.event.mentions:
         names.append(mention.name)

     self.assertEqual(slugs, ['Clinton'])
     self.assertEqual(names, ['Clinton'])