Example #1
0
 def dont_geotag_if_detail_exists(self, nominatim):
     """Geotagging is skipped when a fact already carries a location."""
     record = Gkg(
         id=3771256,
         gkgrecordid="20170215174500-2503",
         date=20170215174500,
         document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties",
     )
     self.session.add(record)
     analysis = Analysis(gkg=record, status=Status.NEW)
     self.session.add(analysis)
     body = DocumentContent(
         content_clean="It was early Saturday when a flash flood hit large parts of India and Pakistan and washed away more than 500 houses",
     )
     self.session.add(body)
     self.session.commit()
     # Link the scraped content to the analysis once ids are assigned.
     analysis.content_id = body.id
     self.session.commit()
     fact = Fact(unit='person', term='displaced')
     self.session.add(fact)
     self.session.commit()
     india = self.session.query(Location).filter(
         Location.location_name == 'India').one_or_none()
     fact.locations.append(india)
     analysis.facts.append(fact)
     self.session.commit()
     process_locations(analysis)
     # The Nominatim geotagger must never have been invoked.
     assert not nominatim.called
Example #2
0
def homepage():
    """Render the landing page: ten most recent analyses plus summary counts."""
    session = Session()
    try:
        recent = (session.query(Analysis)
                  .order_by(desc(Analysis.updated))
                  .limit(10)
                  .all())
        status_counts = Analysis.status_counts(session)
        category_counts = Analysis.category_counts(session)
        return render_template(
            'index.html',
            articles=recent,
            counts=status_counts,
            cat_counts=category_counts,
        )
    finally:
        # Always return the connection to the pool.
        session.close()
Example #3
0
 def test_extract_eviction_facts(self):
     """Eviction wording yields a fact tagged with the EVICTED term."""
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     self.session.add(analysis)
     doc = DocumentContent(
         content_clean="ordered eviction for 2000 people from their homes in Bosnia")
     self.session.add(doc)
     self.session.commit()
     # Wire the persisted content onto the analysis before extraction.
     analysis.content_id = doc.id
     self.session.commit()
     extract_facts(analysis)
     self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)
Example #4
0
 def test_extract_facts_simple(self):
     """A single displacement sentence produces exactly one saved fact."""
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     self.session.add(analysis)
     doc = DocumentContent(
         content_clean="It was early Saturday when a flash flood hit the area and washed away more than 500 houses")
     self.session.add(doc)
     self.session.commit()
     # Wire the persisted content onto the analysis before extraction.
     analysis.content_id = doc.id
     self.session.commit()
     extract_facts(analysis)
     self.assertEqual(1, len(analysis.facts))
Example #5
0
 def test_extract_refugee_facts(self):
     """Refugee wording yields a fact tagged with the REFUGEE term."""
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     self.session.add(analysis)
     doc = DocumentContent(
         content_clean="It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee.")
     self.session.add(doc)
     self.session.commit()
     # Wire the persisted content onto the analysis before extraction.
     analysis.content_id = doc.id
     self.session.commit()
     extract_facts(analysis)
     self.assertEqual(FactTerm.REFUGEE, analysis.facts[0].term)
Example #6
0
 def test_extract_sacked_facts(self):
     """'Sacked from their homes' wording yields the SACKED term."""
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     self.session.add(analysis)
     doc = DocumentContent(
         content_clean="last week 2000 people have been sacked from their homes in Nigeria")
     self.session.add(doc)
     self.session.commit()
     # Wire the persisted content onto the analysis before extraction.
     analysis.content_id = doc.id
     self.session.commit()
     extract_facts(analysis)
     self.assertEqual(FactTerm.SACKED, analysis.facts[0].term)
Example #7
0
 def test_create_duplicate_fact(self):
     """A fact spanning two countries is split into one fact per country."""
     record = Gkg(
         id=3771256,
         gkgrecordid="20170215174500-2503",
         date=20170215174500,
         document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties",
     )
     self.session.add(record)
     analysis = Analysis(gkg=record, status=Status.NEW)
     self.session.add(analysis)
     self.session.commit()
     fact = Fact(unit='person', term='displaced')
     self.session.add(fact)
     self.session.commit()
     # Attach one known location from each of two different countries.
     for name in ('India', 'Pakistan'):
         loc = self.session.query(Location).filter(
             Location.location_name == name).one_or_none()
         fact.locations.append(loc)
     analysis.facts.append(fact)
     self.session.commit()
     self.assertEqual(1, len(analysis.facts))
     process_locations(analysis)
     # One fact per country must result, each keeping a single location.
     self.assertEqual(2, len(analysis.facts))
     countries = [f.iso3 for f in analysis.facts]
     self.assertIn('IND', countries)
     self.assertIn('PAK', countries)
     self.assertEqual(1, len(analysis.facts[0].locations))
     self.assertEqual(1, len(analysis.facts[1].locations))
Example #8
0
    def work(self):
        """Create a NEW Analysis for up to 1000 Gkg documents that lack one.

        Selects Gkg rows with no associated Analysis, locks them FOR
        UPDATE so concurrent workers cannot process the same rows, and
        creates an Analysis in Status.NEW for each.

        Returns:
            True if at least one Analysis was created, False when there
            was no work to do.
        """
        # Start a fresh session for each job.
        session = Session()
        try:
            # Select Gkg documents
            # ... for which no Analysis exists
            # ... locked for update to guard against concurrent workers
            # ... oldest first, at most 1000 per batch
            gkgs = session.query(Gkg) \
                .filter(~session.query(Analysis).filter(Gkg.id == Analysis.gkg_id).exists()) \
                .with_for_update() \
                .order_by(Gkg.date) \
                .limit(1000).all()
            if not gkgs:
                return False  # no work to be done
            for gkg in gkgs:
                analysis = Analysis(gkg=gkg, status=Status.NEW)
                session.add(analysis)
                session.commit()
                # Lazy %-style args: message is only formatted if emitted.
                logger.info("Worker %s created Analysis %s in status %s",
                            os.getpid(), analysis.gkg_id, analysis.status)
        finally:
            # Make sure to release the FOR UPDATE lock, if we got one.
            # (session is always bound here: Session() runs before the try.)
            session.rollback()
            session.close()

        return True
Example #9
0
 def test_use_existing_location(self):
     """Extraction reuses a pre-existing Location row rather than making a new one."""
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     self.session.add(analysis)
     doc = DocumentContent(
         content_clean="It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses")
     self.session.add(doc)
     # Seed a Location matching the place name in the text.
     existing = Location(location_name='Bosnia')
     self.session.add(existing)
     self.session.commit()
     analysis.content_id = doc.id
     self.session.commit()
     extract_facts(analysis)
     extracted = analysis.facts[0].locations[0]
     self.assertEqual(existing.id, extracted.id)
Example #10
0
 def test_scrape_pdf(self):
     """Scraping a PDF URL stores flattened text with content_type 'pdf'."""
     analysis = Analysis(
         gkg=Gkg(document_identifier="https://www1.ncdc.noaa.gov/pub/data/extremeevents/specialreports/Hurricane-Katrina.pdf"),
         status=Status.NEW)
     self.session.add(analysis)
     self.session.commit()
     scrape(analysis)
     scraped = analysis.content
     self.assertEqual("pdf", scraped.content_type)
     # Key terms survive extraction and newlines are stripped.
     self.assertIn("Katrina", scraped.content)
     self.assertIn("Louisiana", scraped.content)
     self.assertNotIn("\n", scraped.content)
Example #11
0
def create_new_analysis_from_url(session, url):
    """Create and persist a NEW Analysis for *url*.

    Builds a Gkg row stamped with the current time in the
    YYYYMMDDHHMMSS format, attaches a fresh Analysis in Status.NEW with
    zero retrieval attempts, commits, and returns the Analysis.

    Args:
        session: database session used to persist the new rows.
        url: document URL the analysis should cover.

    Returns:
        The committed Analysis instance.
    """
    scn = get_scn_from_url(url)
    # strftime replaces the error-prone hand-assembled zero-padded string.
    gkg_date = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    article = Gkg(document_identifier=url,
                  date=gkg_date,
                  source_common_name=scn)
    analysis = Analysis(gkg=article, status=Status.NEW, retrieval_attempts=0)
    session.add(analysis)
    session.commit()
    return analysis
Example #12
0
 def test_create_locations_with_names(self):
     """Unknown place names become Location rows with no country set yet."""
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     self.session.add(analysis)
     doc = DocumentContent(
         content_clean="It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses")
     self.session.add(doc)
     self.session.commit()
     analysis.content_id = doc.id
     self.session.commit()
     extract_facts(analysis)
     self.assertEqual(1, len(analysis.facts))
     fact = analysis.facts[0]
     self.assertEqual(2, len(fact.locations))
     names = [loc.location_name for loc in fact.locations]
     self.assertIn('London', names)
     self.assertIn('Middlesex', names)
     # Neither location has been resolved to a country at this stage.
     self.assertEqual([None, None], [loc.country for loc in fact.locations])
Example #13
0
 def test_scrape_html(self):
     """Scraping an HTML page stores cleaned text plus a search vector."""
     analysis = Analysis(
         gkg=Gkg(document_identifier="http://www.cnn.com/2013/08/23/us/hurricane-katrina-statistics-fast-facts/index.html"),
         status=Status.NEW)
     self.session.add(analysis)
     self.session.commit()
     scrape(analysis)
     scraped = analysis.content
     self.assertEqual("text", scraped.content_type)
     self.assertIn("Katrina", scraped.content_clean)
     self.assertIn("Louisiana", scraped.content_clean)
     self.assertNotIn("\n", scraped.content_clean)
     self.assertIsNotNone(scraped.content_ts)
     # The tsvector column supports full-text search over the content.
     matches = self.session.query(DocumentContent).filter(
         DocumentContent.content_ts.match('Katrina & Louisiana')).all()
     self.assertIn(scraped, matches)
Example #14
0
 def test_fail_if_geotagging_fails(self, nominatim):
     """process_locations propagates a GeotagException from the geotagger."""
     nominatim.side_effect = GeotagException()
     record = Gkg(
         id=3771256,
         gkgrecordid="20170215174500-2503",
         date=20170215174500,
         document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties",
     )
     self.session.add(record)
     analysis = Analysis(gkg=record, status=Status.NEW)
     self.session.add(analysis)
     self.session.commit()
     fact = Fact(unit='person', term='displaced')
     self.session.add(fact)
     self.session.commit()
     # An unknown place name forces a geotag lookup, which will blow up.
     fact.locations.append(Location(location_name="Ruislip"))
     analysis.facts.append(fact)
     self.session.commit()
     with self.assertRaises(GeotagException):
         process_locations(analysis)
Example #15
0
    def test_status_update(self):
        """A stale Analysis may not create a new version (optimistic locking)."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)
        self.assertEqual(Status.SCRAPING, analysis.status)

        # Simulate a concurrent process advancing the same Analysis.
        session2 = Session()
        try:
            competitor = session2.query(Analysis).get(analysis.gkg_id)
            competitor.create_new_version(Status.SCRAPING_FAILED)
        finally:
            session2.rollback()

        # Our copy is now stale, so the next transition must be rejected.
        with self.assertRaises(NotLatestException):
            analysis.create_new_version(Status.SCRAPED)
Example #16
0
    def test_version_lifecycle(self):
        """Each create_new_version snapshots the prior state into history."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        history = self.session.query(AnalysisHistory).filter(
            AnalysisHistory.gkg == gkg)

        def assert_history(expected):
            # expected maps each Status to the snapshot count it should have.
            self.assertEqual(sum(expected.values()), history.count())
            for status, count in expected.items():
                self.assertEqual(
                    count,
                    history.filter(AnalysisHistory.status == status).count())

        analysis.create_new_version(Status.SCRAPING)
        assert_history({Status.NEW: 1})

        content = DocumentContent(content_type="text/html",
                                  content="Lorem ipsum")
        analysis.content = content
        analysis.create_new_version(Status.SCRAPED)
        assert_history({Status.NEW: 1, Status.SCRAPING: 1})

        analysis.create_new_version(Status.EXTRACTING)
        assert_history({Status.NEW: 1, Status.SCRAPING: 1, Status.SCRAPED: 1})

        # content is preserved in the SCRAPED snapshot
        scraped = history.filter(
            AnalysisHistory.status == Status.SCRAPED).one_or_none()
        self.assertEqual(analysis.content, scraped.content)

        fact = Fact(analysis_date=datetime.now())
        analysis.facts = [fact]
        analysis.create_new_version(Status.EXTRACTED)
        assert_history({Status.NEW: 1, Status.SCRAPING: 1,
                        Status.SCRAPED: 1, Status.EXTRACTING: 1})

        # content still preserved in the EXTRACTING snapshot
        extracting = history.filter(
            AnalysisHistory.status == Status.EXTRACTING).one_or_none()
        self.assertEqual(analysis.content, extracting.content)

        analysis.create_new_version(Status.EDITING)
        analysis.content = DocumentContent(content_type="text/html",
                                           content="Lorem edited")
        analysis.create_new_version(Status.EDITED)
        assert_history({Status.NEW: 1, Status.SCRAPING: 1, Status.SCRAPED: 1,
                        Status.EXTRACTING: 1, Status.EXTRACTED: 1,
                        Status.EDITING: 1})

        # content has changed, but the facts carry over into the snapshot
        extracted = history.filter(
            AnalysisHistory.status == Status.EXTRACTED).one_or_none()
        self.assertNotEqual(analysis.content.id, extracted.content.id)
        self.assertCountEqual([f.id for f in analysis.facts],
                              [f.id for f in extracted.facts])

        analysis.create_new_version(Status.EDITING)
        fact2 = Fact(analysis_date=datetime.now())
        analysis.facts.append(fact2)
        analysis.create_new_version(Status.EDITED)
        assert_history({Status.NEW: 1, Status.SCRAPING: 1, Status.SCRAPED: 1,
                        Status.EXTRACTING: 1, Status.EXTRACTED: 1,
                        Status.EDITING: 2, Status.EDITED: 1})

        # The EDITED snapshot holds only the original fact; the live
        # analysis has both.
        edited = history.filter(
            AnalysisHistory.status == Status.EDITED).one_or_none()
        self.assertCountEqual([f.id for f in analysis.facts],
                              [fact.id, fact2.id])
        self.assertCountEqual([f.id for f in edited.facts], [fact.id])
Example #17
0
    def test_status_counts(self):
        """status_counts reflects only the latest status of each Analysis."""
        gkgs = self.session.query(Gkg).all()[:2]

        first = Analysis(gkg=gkgs[0], status=Status.NEW)
        self.session.add(first)
        self.session.commit()
        self.assertEqual({Status.NEW: 1},
                         Analysis.status_counts(self.session))

        # Advancing the version replaces, not duplicates, its count.
        first.create_new_version(Status.SCRAPING)
        self.assertEqual({Status.SCRAPING: 1},
                         Analysis.status_counts(self.session))

        second = Analysis(gkg=gkgs[1], status=Status.NEW)
        self.session.add(second)
        self.session.commit()
        self.assertEqual({Status.NEW: 1, Status.SCRAPING: 1},
                         Analysis.status_counts(self.session))

        second.create_new_version(Status.SCRAPING)
        self.assertEqual({Status.SCRAPING: 2},
                         Analysis.status_counts(self.session))

        second.create_new_version(Status.SCRAPED)
        self.assertEqual({Status.SCRAPED: 1, Status.SCRAPING: 1},
                         Analysis.status_counts(self.session))