Пример #1
0
    def test_summary_sentences(self):
        # The event's summary sentences should match what the summarizer
        # produces for the article, each paired with the article's
        # source name and external url.
        from argos.core.brain import summarizer

        source_name = 'Super Cool Times'
        url = 'http://foo.com'
        title = 'Syria Misses New Deadline as It Works to Purge Arms'
        text = (
            'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. '
            'The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
        )
        expected_sents = summarizer.summarize(title, text)

        source = Source()
        source.name = source_name

        article = Article(title=title, text=text, score=100)
        article.source = source
        article.ext_url = url

        self.event = Event([article])

        # One entry per summarized sentence, in the summarizer's order.
        expected = []
        for sent in expected_sents:
            expected.append({
                'sentence': sent,
                'source': source_name,
                'url': url
            })

        self.assertEqual(self.event.summary_sentences, expected)
Пример #2
0
def process_events(h, clusters):
    """
    Reconciles the active events with the given clusters of node uuids.

    Clusters are triaged against the active events: new events are
    created, changed events are updated, unchanged events whose last
    update is more than 3 days old are frozen (and their articles pruned
    from `h`), and events marked for deletion are removed. All changes
    are committed in a single commit at the end.
    """
    now = datetime.utcnow()

    # Index the active events by id, and record the node ids
    # of each event's articles for comparison against the clusters.
    event_map = {}
    existing = {}
    for event in Event.all_active():
        event_map[event.id] = event
        existing[event.id] = [article.node_id for article in event.articles]

    # Decide which events to update, create, delete, or leave alone.
    to_update, to_create, to_delete, unchanged = triage(existing, clusters)

    for a_ids in to_create:
        node_ids = [node_id.item() for node_id in a_ids]
        matching = Article.query.filter(Article.node_id.in_(node_ids))
        articles = matching.order_by(Article.created_at.desc()).all()
        event = Event(articles)

        # NOTE(review): articles are sorted newest-first here, so created_at
        # comes from the newest article and updated_at from the oldest.
        # Confirm this is intended and not inverted.
        event.created_at = articles[0].created_at
        event.updated_at = articles[-1].updated_at

        rep_article = representative_article(h, a_ids, articles)
        event.title, event.image = rep_article.title, rep_article.image

        db.session.add(event)

    for e_id, a_ids in to_update.items():
        event = event_map[e_id]
        node_ids = [node_id.item() for node_id in a_ids]
        articles = Article.query.filter(Article.node_id.in_(node_ids)).all()
        event.members = articles

        rep_article = representative_article(h, a_ids, articles)
        event.title, event.image = rep_article.title, rep_article.image

        event.update()

    # Freeze expiring events and clean up their articles from the hierarchy.
    for e_id in unchanged:
        event = event_map[e_id]
        if (now - event.updated_at).days <= 3:
            continue
        event.active = False
        h.prune([h.to_iid(article.node_id) for article in event.articles])

    # Deletion happens LAST so that a deleted event's articles
    # have a chance to be moved to their new clusters (if any).
    # Open question from the original author: should the articles be pruned
    # here as well? The assumption is that they have all migrated elsewhere.
    for e_id in to_delete:
        db.session.delete(event_map[e_id])

    db.session.commit()
Пример #3
0
    def test_event_similarity_with_cluster_different(self):
        # Similarity between the prepared event and an event built from
        # the 'different' articles should be neither 1.0 nor 0.0.
        self.prepare_event()
        different_articles = self.prepare_articles(type='different')
        other = Event(different_articles)

        similarity = self.cluster.similarity(other)
        for extreme in (1.0, 0.0):
            self.assertNotEqual(similarity, extreme)
Пример #4
0
 def test_event_entitize(self):
     # The event should expose exactly the entities 'Clinton' and 'Reagan'.
     robots = Article(title='Robots', text='dinosaurs are cool, Reagan')
     prepared = self.prepare_articles()[0]
     self.cluster = Event([robots, prepared])

     names = set()
     for ent in self.cluster.entities:
         names.add(ent.name)
     self.assertEqual(names, {'Clinton', 'Reagan'})
Пример #5
0
    def test_story_clustering_without_matching_entities(self):
        existing_story = fac.story()

        # Build an event whose entities are completely
        # different from those of the story.
        illiad = fac.article(title='The Illiad',
                             text='The Illiad has Argos in it.')
        unrelated_event = Event([illiad])

        Story.cluster([unrelated_event])

        # The story should still have 2 members,
        # and there should be 2 stories in total.
        self.assertEqual(len(existing_story.members), 2)
        self.assertEqual(Story.query.count(), 2)
Пример #6
0
    def test_event_similarity_with_cluster_duplicates(self):
        self.prepare_event()
        duplicate_event = Event(self.prepare_articles())
        similarity = self.cluster.similarity(duplicate_event)

        # Identical clusters do not currently yield a similarity of 1.0:
        # the calculation averages the similarity of the articles between
        # the clusters rather than measuring the overlap of the clusters,
        # so an exact 1.0 is not asserted here.
        self.assertAlmostEqual(similarity, 0.83999999999999)
Пример #7
0
    def _create_dated_story(self):
        # Builds and commits a story of three single-article events whose
        # created_at values are, in order, 1 day, 5 days, and 1 day before
        # now. Returns the story together with the two timestamps used.
        datetime_A = datetime.utcnow() - timedelta(days=1)
        datetime_B = datetime.utcnow() - timedelta(days=5)

        events = []
        for created_at in (datetime_A, datetime_B, datetime_A):
            article = fac.article(title='The Illiad', text='The Illiad has Argos in it.')
            event = Event([article])
            event.created_at = created_at
            events.append(event)

        story = Story(events)

        session = self.db.session
        session.add(story)
        session.commit()

        return story, datetime_A, datetime_B
Пример #8
0
    def test_conceptize(self):
        # The event's concept slugs and mention names should both be
        # exactly 'Clinton' and 'Reagan'.
        robots = Article(title='Robots', text='dinosaurs are cool, Reagan')
        prepared = self.prepare_articles()[0]
        self.event = Event([robots, prepared])

        expected_names = {'Clinton', 'Reagan'}

        concepts = set()
        for con in self.event.concepts:
            concepts.add(con.slug)

        mentions = set()
        for ali in self.event.mentions:
            mentions.add(ali.name)

        self.assertEqual(concepts, expected_names)
        self.assertEqual(mentions, expected_names)

        # The scores are not 0.5 each, because
        # they are weighed down by the commonness.
        for concept in self.event.concepts:
            self.assertAlmostEqual(concept.score, 0.005, places=3)
Пример #9
0
 def test_event_timespan(self):
     # Articles 'A', 'B', 'C' are created on Jan 20, 22 and 24 (2014);
     # timespan() called with Jan 21 should return only 'B' and 'C'.
     text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
     members = [
         Article(title=title,
                 text=text,
                 created_at=datetime(2014, 1, day, 1, 1, 1, 111111))
         for title, day in (('A', 20), ('B', 22), ('C', 24))
     ]
     self.cluster = Event(members)

     cutoff = datetime(2014, 1, 21, 1, 1, 1, 111111)
     results = self.cluster.timespan(cutoff)

     self.assertEqual(len(results), 2)
     titles = {r.title for r in results}
     self.assertEqual(titles, {'B', 'C'})
Пример #10
0
    def test_event_deletion_removes_from_articles_events(self):
        articles = self.prepare_articles()
        session = self.db.session
        for article in articles:
            session.add(article)

        # Persist an event built from the articles.
        self.event = Event(articles)
        session.add(self.event)
        session.commit()

        # Every article should reference the event.
        expected_events = [self.event]
        for article in articles:
            self.assertEqual(article.events, expected_events)

        # Delete all events.
        Event.query.delete()
        session.commit()

        # The articles' references to the event should be gone.
        for article in articles:
            self.assertEqual(article.events, [])
Пример #11
0
 def test_summarize(self):
     # An event built from the prepared articles should have a truthy summary.
     articles = self.prepare_articles()
     self.event = Event(articles)
     summary = self.event.summary
     self.assertTrue(summary)
Пример #12
0
    def test_score_prefer_newer_events(self):
        # Both events come from prepare_articles() with default arguments;
        # the one constructed second is expected to score higher.
        older = Event(self.prepare_articles())
        newer = Event(self.prepare_articles())

        newer_score = newer.score
        older_score = older.score
        self.assertGreater(newer_score, older_score)
Пример #13
0
    def test_score_prefer_events_with_higher_article_scores(self):
        # The event whose articles are prepared with score=200 is expected
        # to outscore the event built from default articles.
        baseline = Event(self.prepare_articles())
        boosted = Event(self.prepare_articles(score=200))

        boosted_score = boosted.score
        baseline_score = baseline.score
        self.assertGreater(boosted_score, baseline_score)
Пример #14
0
 def test_event_similarity_with_object_duplicates(self):
     # An event built from articles prepared with type='duplicate'
     # should have a similarity of exactly 1.0 with self.article.
     duplicates = self.prepare_articles(type='duplicate')
     event = Event(duplicates)
     self.assertEqual(event.similarity(self.article), 1.0)
Пример #15
0
 def prepare_event(self):
     # Builds an event from the prepared articles, keeps it on
     # self.cluster, and commits it to the database session.
     articles = self.prepare_articles()
     self.cluster = Event(articles)

     session = self.db.session
     session.add(self.cluster)
     session.commit()
Пример #16
0
 def test_event_summarize_single_article(self):
     # An event with only the first prepared article should still
     # have a truthy summary.
     first_article = self.prepare_articles()[0]
     self.cluster = Event([first_article])
     summary = self.cluster.summary
     self.assertTrue(summary)
Пример #17
0
 def test_event_summarize(self):
     # An event built from all the prepared articles should have a truthy summary.
     articles = self.prepare_articles()
     self.cluster = Event(articles)
     summary = self.cluster.summary
     self.assertTrue(summary)
Пример #18
0
 def test_event_titleize(self):
     # The event's title should be 'Dinosaurs', not the 'Robots' title.
     # NOTE(review): 'Dinosaurs' is presumably the title of the articles
     # prepared with type='duplicate' — confirm against prepare_articles.
     robots = Article(title='Robots', text='dinosaurs are cool, Reagan')
     duplicates = self.prepare_articles(type='duplicate')
     self.cluster = Event([robots] + duplicates)

     title = self.cluster.title
     self.assertEqual(title, 'Dinosaurs')
Пример #19
0
 def test_event_entitize_no_duplicates(self):
     # Names are compared as a list (not a set), so a repeated
     # 'Clinton' entity would make this fail.
     self.cluster = Event(self.prepare_articles())

     entities = []
     for ent in self.cluster.entities:
         entities.append(ent.name)

     self.assertEqual(entities, ['Clinton'])
Пример #20
0
 def prepare_event(self):
     # Builds an event from the prepared articles, keeps it on
     # self.event, and commits it to the database session.
     articles = self.prepare_articles()
     self.event = Event(articles)

     session = self.db.session
     session.add(self.event)
     session.commit()
Пример #21
0
 def test_conceptize_no_duplicates(self):
     # Slugs and names are compared as lists (not sets), so a repeated
     # 'Clinton' concept or mention would make this fail.
     self.event = Event(self.prepare_articles())

     concepts = []
     for con in self.event.concepts:
         concepts.append(con.slug)

     mentions = []
     for ali in self.event.mentions:
         mentions.append(ali.name)

     expected = ['Clinton']
     self.assertEqual(concepts, expected)
     self.assertEqual(mentions, expected)