def test_date_count_table(self):
    # Annotates a report-date / count table and inspects the last two
    # structured incidents produced for it.
    report = AnnoDoc(""" Cumulative case data Report date / Cases / Deaths / New cases per week 26 Jun 2017 / 190 / 10 / 8 Sep 2017 / 300 / 12 / 9 Sep 2017 / 309 / 13 / 15 Sep 2017 / 319 / 14 / 6 Oct 2017 / 376 / 14 / 13 Oct 2017 / 20 Oct 2017 / 431 / 17 / 34 27 Oct 2017 / 457 / 18 / 26 3 Nov 2017 / 486 / 19 / 29""")
    report.add_tier(self.annotator)
    incident_props = []
    for span in report.tiers['structured_incidents']:
        incident_props.append(remove_empty_props(span.metadata))
    # Last incident: the "New cases per week" figure of the final row.
    expected_weekly_cases = {
        'value': 29,
        'type': 'caseCount',
        'attributes': [],
        'dateRange': [
            datetime.datetime(2017, 10, 28),
            datetime.datetime(2017, 11, 4)],
    }
    self.assertEqual(incident_props[-1], expected_weekly_cases)
    # Second to last: the cumulative death figure of the same row.
    expected_cumulative_deaths = {
        'value': 19,
        'type': 'cumulativeDeathCount',
        'attributes': [],
        'dateRange': [
            datetime.datetime(2017, 11, 3),
            datetime.datetime(2017, 11, 4)],
    }
    self.assertEqual(incident_props[-2], expected_cumulative_deaths)
def test_since_date_2(self):
    # A "Since <date>" phrase is expected to produce a range that starts at
    # the stated date and ends at the date handed to AnnoDoc.
    text = "Since April 6th 2013, 21 cases of infection have been confirmed."
    annotated = AnnoDoc(text, date=datetime.datetime(2014, 12, 10))
    annotated.add_tier(self.annotator)
    observed_range = annotated.tiers['dates'].spans[0].datetime_range
    expected_range = [
        datetime.datetime(2013, 4, 6),
        datetime.datetime(2014, 12, 10),
    ]
    self.assertEqual(observed_range, expected_range)
def test_vietnamese(self):
    # Normally this should be spelled Cao Bằng, but I want to test
    # that the ascii version works.
    doc = AnnoDoc(u"At Cao Bang, Vietnam 5 cases were recorded.")
    doc.add_tier(self.annotator)
    resolved_id = doc.tiers['geonames'].spans[0].geoname['geonameid']
    # Either of the two listed geoname ids is accepted. assertIn (rather
    # than assertTrue on an `in` expression) reports the unexpected id when
    # the check fails.
    self.assertIn(resolved_id, ['1586182', '1586185'])
def test_missing_count_bug(self):
    # Every state row of the table should yield one structured incident,
    # in row order, each with a resolved location.
    table = AnnoDoc(""" State / Number of Cases Alabama / 25 Arizona / 6 Arkansas / 9 California / 54 Colorado / 18 N Dakota / 1 S Dakota / 1 Connecticut / 9 """)
    table.add_tier(self.annotator)
    geonameids = []
    for span in table.tiers['structured_incidents']:
        location = span.metadata['location']
        # A location may be a geoname dict or some other value; only the
        # dict form is reduced to its id.
        if isinstance(location, dict):
            geonameids.append(location['geonameid'])
        else:
            geonameids.append(location)
    expected_ids = [
        '4829764',
        '5551752',
        '4099753',
        '5332921',
        '5417618',
        '5690763',
        '5769223',
        '4831725',
    ]
    self.assertEqual(geonameids, expected_ids)
def test_dateparse_bug_2(self):
    # Regression test: the date annotator tries to parse "72\n1994", which
    # triggers an exception in the dateparse library. The test passes as
    # long as annotating the text does not raise.
    text = """ Year Cases Fatal 1991 46,320 697\n1992 31,870 208\n1993 6,833 72\n1994 1,785 16\n1995 2,160 23"""
    annotated = AnnoDoc(text)
    annotated.add_tier(self.annotator)
def test_unknown_species_and_space_delimited_counts(self):
    # Covers counts written with spaces as thousands separators and a
    # species cell that cannot be resolved.
    summary = AnnoDoc(""" The epidemiological statistics accumulated since the start of the event are included in the following "outbreak summary": Species / Susceptible / Cases / Deaths / Killed and disposed of / Slaughtered Birds / 6 368 632 / 1 303 173 / 1 297 617 / 3 850 608 / 0 Black-crowned night-heron / not available / 1 / 1 / 0 / 0 Passeridae (unidentified) / not available / 2 / 2 / 0 / 0 Pale thrush / not available / 1 / 1 / 0 / 0 """)
    summary.add_tier(self.annotator)
    incident_props = []
    for span in summary.tiers['structured_incidents']:
        incident_props.append(remove_empty_props(span.metadata))
    # First incident: "1 303 173" read as a single number for the Birds row.
    first_expected = {
        'attributes': [],
        'type': 'caseCount',
        'value': 1303173,
        'species': {'id': 'tsn:174371', 'label': 'Aves'},
    }
    self.assertEqual(incident_props[0], first_expected)
    # Last incident: the species is reported as the "Cannot parse" marker.
    last_expected = {
        'attributes': [],
        'type': 'deathCount',
        'value': 1,
        'species': "Cannot parse",
    }
    self.assertEqual(incident_props[-1], last_expected)
def test_unusual_format(self):
    # Table that mixes "//" and "/" as column separators.
    stats = AnnoDoc(""" For subscribers' convenience, we hereby reproduce Israel's annual rabies statistics since 2014: Year // badger / cat / fox / jackal / wolf / dog / cattle / sheep / horse // total 2014 // 3 / 0 / 2 / 2 / 4 / 2 / 1 / 0 / 0 // 14 2015 // 12 / 1 / 1 / 3 / 0 / 1 / 7 / 0 / 1 // 20 2016 // 12 / 0 / 7 / 5 / 0 / 0 / 5 / 0 / 1 // 30 2017 // 10 / 2 / 0 / 47 / 0 / 0 / 14 / 1 / 0 // 74 2018 // 4 / 0 / 0 / 35 / 0 / 1 / 7 / 1 / 1 // 51 """)
    stats.add_tier(self.annotator)
    incident_props = []
    for span in stats.tiers['structured_incidents']:
        incident_props.append(remove_empty_props(span.metadata))
    # A value from row one is not used because 2014 is missed by the date
    # parser although other years are caught.
    # Index 2 refers to the badgers in 2015. It is an unintuitive index
    # because some species are not being parsed so their values are skipped.
    badgers_2015 = incident_props[2]
    self.assertEqual(badgers_2015['type'], 'caseCount')
    self.assertEqual(badgers_2015['value'], 12)
    self.assertEqual(badgers_2015['species']['label'], 'Taxidea taxus')
    expected_year = [
        datetime.datetime(2015, 1, 1, 0, 0),
        datetime.datetime(2016, 1, 1, 0, 0),
    ]
    self.assertEqual(badgers_2015['dateRange'], expected_year)
def test_multipart_names_2(self):
    # "City, State, Country" should come back as one geoname span covering
    # the whole phrase, and annotating must leave the text untouched.
    text = 'I used to live in Seattle, Washington, USA'
    annotated = AnnoDoc(text)
    annotated.add_tier(self.annotator)
    self.assertEqual(annotated.text, text)
    geo_spans = annotated.tiers['geonames'].spans
    self.assertEqual(len(geo_spans), 1)
    self.assertEqual(geo_spans[0].text, "Seattle, Washington, USA")
def test_partial_year_range(self):
    # "1912-17" abbreviates the end year; the expected range runs from the
    # start of 1912 to the start of 1918.
    annotated = AnnoDoc("From 1912-17 some stuff happened.")
    annotated.add_tier(self.annotator)
    observed_range = annotated.tiers['dates'].spans[0].datetime_range
    expected_range = [
        datetime.datetime(1912, 1, 1),
        datetime.datetime(1918, 1, 1),
    ]
    self.assertEqual(observed_range, expected_range)
def test_non_cumulative_total(self):
    # "a total of N" bounded by an explicit date range is expected to be
    # typed as a plain caseCount.
    annotated = AnnoDoc(""" Between 1 January 2011 - 22 Dec 2014, a total of 103 confirmed cases of hantavirus have been reported in Antarctica. """)
    annotated.add_tier(self.annotator)
    first_incident = annotated.tiers['incidents'][0]
    self.assertEqual(first_incident.metadata['type'], 'caseCount')
def test_chicago(self):
    # Checks the span text, label, offsets and geoname record produced for
    # a single city mention.
    text = 'I went to Chicago.'
    annotated = AnnoDoc(text)
    annotated.add_tier(self.annotator)
    self.assertEqual(annotated.text, text)
    self.assertEqual(len(annotated.tiers['geonames'].spans), 1)
    chicago = annotated.tiers['geonames'].spans[0]
    self.assertEqual(chicago.text, "Chicago")
    self.assertTrue(chicago.label.endswith("Chicago"))
    self.assertEqual(chicago.start, 10)
    self.assertEqual(chicago.end, 17)
    geoname = chicago.metadata['geoname'].to_dict()
    # These three fields are removed before the exact comparison below.
    for excluded_key in ('name_count', 'score', 'population'):
        del geoname[excluded_key]
    expected_geoname = {
        'admin1_code': u'IL',
        'admin1_name': u'Illinois',
        'admin2_code': u'031',
        'admin2_name': u'Cook County',
        'admin3_code': u'14000',
        'admin3_name': u'City of Chicago',
        'admin4_code': u'',
        'asciiname': u'Chicago',
        'country_code': u'US',
        'country_name': u'United States',
        'feature_code': u'PPLA2',
        'geonameid': u'4887398',
        'latitude': 41.85003,
        'longitude': -87.65005,
        'name': u'Chicago',
        'names_used': u'Chicago',
        'parents': [],
    }
    self.assertEqual(geoname, expected_geoname)
def test_sentence_segmentation(self):
    # Each "N case(s) on <date>" heading is followed by per-patient lines
    # naming a place; the first incident should carry two locations and
    # the last incident one.
    listing = AnnoDoc(""" 2 cases on 26 Dec 2014 1- Riyadh, Saudi Arabia: 31-year-old Saudi female, non-healthcare worker, currently in critical condition 2- Qurayyat, Saudi Arabia: 70-year-old Saudi male, non-healthcare worker, currently in stable condition 1 case on 19 Dec 2014 Alkharj: 53-year-old Saudi male, non-healthcare worker, history of animal exposure, no history of contact with suspected or confirmed cases in the healthcare environment or in the community 2 cases on 25 Dec 2014 Alkharj: 53-year-old Saudi male, non-healthcare worker, history of pre-existing co-morbidities Taif: 70-year-old Saudi female, non-healthcare worker, history of pre-existing co-morbidities 1 case on 22 Dec 2014 Taif: 29-year-old Expat female, healthcare worker, no history of co-morbidities """)
    listing.add_tier(self.annotator)
    first_locations = listing.tiers['incidents'].spans[0].metadata['locations']
    self.assertEqual(len(first_locations), 2)
    last_locations = listing.tiers['incidents'].spans[-1].metadata['locations']
    self.assertEqual(len(last_locations), 1)
def test_disease_scope(self):
    # A polio bulletin that mentions Ebola in passing: the disease attached
    # to each incident should follow the nearest applicable mention.
    bulletin = AnnoDoc(""" POLIOMYELITIS UPDATE: ***************************************************************************** Poliovirus Weekly Update 26 Sep 2018, WHO ----------------------------------------- New wild poliovirus cases reported this week: 0 Total number of wild poliovirus cases in 2018: 18 Total number of wild poliovirus cases in 2017: 22 New cVDPV cases reported this week: 10 Total number of cVDPV cases (all types) in 2018: 53 Total number of cVDPV cases (all types) in 2017: 96 Papua New Guinea - 2 new cases of cVDPV1 were reported in the past week, bringing the total number of cases in 2018 to 14. These latest reported cases are from Jiwaka and Eastern Highlands provinces and had onset of paralysis on [13 Aug 2018 and 16 Jun 2018], respectively. - The polio teams are coordinating with the broader humanitarian emergency network as was done during the recent Ebola outbreak that infected 17 people. - 5 deaths were reported in 2002. Middle East - No new cases of cVDPV2 were reported in the past week in Syria. """)
    bulletin.add_tier(self.annotator)
    # 17 cases of Ebola
    ebola_incident = bulletin.tiers['incidents'].spans[-2]
    self.assertEqual(
        ebola_incident.metadata['resolvedDisease']['id'],
        'http://purl.obolibrary.org/obo/DOID_4325')
    # The final report of 5 deaths should be associated with polio
    polio_incident = bulletin.tiers['incidents'].spans[-1]
    self.assertEqual(
        polio_incident.metadata['resolvedDisease']['id'],
        'http://purl.obolibrary.org/obo/DOID_4953')
def test_full_article(self):
    """ Tests a full length article """
    # The article is full of numbers that are not case counts (ages,
    # coordinates, percentages, spelled-out figures); only counts whose
    # attributes include 'case' are compared against the expected list.
    example = """ Last week an outbreak of the plague was reported by two different HHS departments in California. The first case involved a 72 year old man with 3 brothers and 1 sister. They should be tested in case they were infected. He had visited between 2 and 4 different countries in the last year. On 1/2/2017 he traveled to Zambia stopping at the lat/long: 121.125123, -90.234512 for a total of 7 days. When asked what his favorite number was he responded, "883814019938" though there has been heavy speculation that the actual favorite is 7003.3383. When searching within a 7 mile radius of the epicenter there were: 5 cases in Allaghaney resulting in 2 deaths 19 cases in The Little Town of Washington causing 8 deaths 2 cases in North San Juan which resulted in the spontanious existance of 1 supernatural being Health professionals have said that there is only a 12 percent chance these are accurate. The directory of Nevada County HHS was quoted as saying, "fifty thousand and twelve, four hundred and twelve, seventy three, one thousand, two hundred and sixteen". Concerned citizens have said, "50,012, 412, 73, 200 and 16" """
    expected_counts = [1, 1, 5, 2, 19, 8, 2]
    article = AnnoDoc(example)
    article.add_tier(self.annotator)
    actual_counts = []
    for count_span in article.tiers['counts'].spans:
        if 'case' in count_span.metadata['attributes']:
            actual_counts.append(count_span.metadata['count'])
    self.assertSequenceEqual(actual_counts, expected_counts)
def test_vernacular_names(self):
    # Three common animal names appear in the text (buffaloes, ducks,
    # chickens); three species spans are expected.
    annotated = AnnoDoc(""" "A total of 5 of my buffaloes were found dead yesterday [Mon 21 May 2018]," Nan said. 114 backyard ducks, 27 backyard chickens have been destroyed and disposed of by the Rapid Response Team. """)
    annotated.add_tier(self.annotator)
    species_tier = annotated.tiers['species']
    self.assertEqual(len(species_tier), 3)
def test_date_range(self):
    # Years given in square brackets after each day/month should still
    # produce one range; the expected end is the day after 5 Jun 2018.
    text = "The 7 new cases age between 17 and 70, and their onset dates vary between 19 May [2018] - 5 Jun [2018]."
    annotated = AnnoDoc(text)
    annotated.add_tier(self.annotator)
    observed_range = annotated.tiers['dates'].spans[0].datetime_range
    expected_range = [
        datetime.datetime(2018, 5, 19),
        datetime.datetime(2018, 6, 6),
    ]
    self.assertEqual(observed_range, expected_range)
def test_non_incident_counts_and_species(self):
    # Of all the numeric columns in the row, only Cases and Deaths are
    # expected to become incidents, both tagged with the row's species.
    table = AnnoDoc(""" Species / Morbidity / Mortality / Susceptible / Cases / Deaths / Killed and disposed of / Slaughtered Orange Spotted Snakehead (_Channa aurantimaculata_) / 100% / 1% / 32 / 30 / 1 / 28 / 3 """)
    table.add_tier(self.annotator)
    incident_props = []
    for span in table.tiers['structured_incidents']:
        incident_props.append(remove_empty_props(span.metadata))
    expected_cases = {
        'attributes': [],
        'type': 'caseCount',
        'value': 30,
        'species': {
            'id': 'tsn:642745',
            'label': 'Channa aurantimaculata'},
    }
    expected_deaths = {
        'attributes': [],
        'type': 'deathCount',
        'value': 1,
        'species': {
            'id': 'tsn:642745',
            'label': 'Channa aurantimaculata'},
    }
    self.assertEqual(incident_props, [expected_cases, expected_deaths])
def test_complex_text(self):
    # Two sentences with four named entities; label, text and character
    # offsets are checked for each one, in document order.
    self.doc = AnnoDoc(
        "I'm married to Joe from New York City. "
        "That is in the United States who works for the Raytheon Corporation."
    )
    self.doc.add_tier(self.annotator)
    # (label, text, start, end) for each expected entity.
    expected_entities = [
        ('PERSON', 'Joe', 15, 18),
        ('GPE', 'New York City', 24, 37),
        ('GPE', 'the United States', 50, 67),
        ('ORG', 'the Raytheon Corporation', 82, 106),
    ]
    self.assertEqual(len(self.doc.tiers['nes'].spans), 4)
    for position, (label, text, start, end) in enumerate(expected_entities):
        entity = self.doc.tiers['nes'].spans[position]
        self.assertEqual(entity.label, label)
        self.assertEqual(entity.text, text)
        self.assertEqual(entity.start, start)
        self.assertEqual(entity.end, end)
def test_multi_section_table(self):
    # A table split into per-country sections, each with a Cases row and a
    # Deaths row. The fifth incident is checked: value 413, the last figure
    # in the Guinea "Cases" row.
    update = AnnoDoc(""" Disease update -------------- Confirmed, probable, and suspect cases and deaths from Ebola virus disease in Guinea, Liberia, and Sierra Leone, as of 30 Jun 2014 Type / New* / Confirmed / Probable / Suspect / Totals by country Guinea Cases / 3 / 293 / 88 / 32 / 413 Deaths / 5 / 193 / 82 / 28 / 303 Liberia Cases / 8 / 52 / 21 / 34 / 107 Deaths / 7 / 33 / 17 / 15 / 65 Sierra Leone Cases / 11 / 199 / 31 / 9 / 239 Deaths / 2 / 65 / 29 / 5 / 99 Totals Cases / 22 / 544 / 140 / 75 / 759 Deaths / 14 / 291 / 128 / 48 / 467 *New cases were reported between 25-29 Jun 2014 """)
    update.add_tier(self.annotator)
    incident_props = []
    for span in update.tiers['structured_incidents']:
        incident_props.append(remove_empty_props(span.metadata))
    country_total = incident_props[4]
    self.assertEqual(country_total['type'], 'cumulativeCaseCount')
    expected_as_of = [
        datetime.datetime(2014, 6, 30, 0, 0),
        datetime.datetime(2014, 7, 1, 0, 0),
    ]
    self.assertEqual(country_total['dateRange'], expected_as_of)
    self.assertEqual(country_total['value'], 413)
    self.assertEqual(country_total['location']['geonameid'], '2420477')
def test_northeast(self):
    # A long journal excerpt that repeatedly mentions "Central and
    # Northeast China". The bare word "Northeast" must not be returned as
    # a geoname span of its own.
    doc = AnnoDoc(u""" Instead, a novel virus was isolated from a patient’s blood. Since March 2010, there were frequent reports of a unique group of hospitalized patients who presented with clinical symptoms similar to those of SFTS in Central and Northeast China (Fig. 1). On the basis of data from a primary investigation in 2009, an enhanced surveillance was implement- ed in selected provinces in China to further in- vestigate the cause and epidemiologic character- istics of SFTS. Here we describe the discovery and characterization of a novel phlebovirus in the Bunyaviridae family, designated SFTS bunyavirus (SFTSV), which is associated with SFTS. We also discuss the clinical manifestations of SFTS and the epidemiologic investigations. Methods Case Definition and Surveillance Methods Since 2009, we have implemented an active sur- veillance program in selected areas in Hubei and Henan provinces to identify patients with SFTS. The syndrome was characterized by acute fever (temperatures of 38°C or more) and thrombocyto- penia (platelet count, <100,000 per cubic millime- ter) of unknown cause.2 We collected blood sam- ples from hospitalized patients whose symptoms fulfilled the criteria of the case definition. We excluded patients whose symptoms fit these crite- ria but who had other clinical or laboratory-con- firmed diagnoses. We defined a laboratory-confirmed case as meeting one or more of the following criteria: the isolation of SFTSV from the patient’s serum, the detection of SFTSV RNA in the patient’s se- rum during the acute phase of the illness, or the detection of seroconversion or an elevation by a factor of four in serum IgG antibodies against SFTSV on enzyme-linked immunosorbent assay (ELISA), indirect immunof luorescence assay, or neutralization testing in serum obtained during the convalescent phase. If possible, we collected serum samples within 2 weeks after the onset of fever and again during the convalescent phase. 
We also collected serum samples from 200 patient- matched healthy persons living in the same areas and during the same time period. The research protocol was approved by the human bioethics committee of the Chinese Center for Disease Con- trol and Prevention, and all participants provided written informed consent. Isolation of an Unknown Pathogen In June 2009, a blood sample in heparin antico- agulant was obtained on day 7 after the onset of illness from a patient from Xinyang City in Henan Province. Because the cause of the illness was un- known, we designed a strategy to isolate the patho- gen by inoculating multiple cell lines susceptible to both viral and rickettsial agents, including hu- man cell line HL60; animal cell lines DH82, L929, Vero, and Vero E6; and tick cell line ISE6. The pa- tient’s white cells were used to inoculate cell mono- layers. The cells were cultured at 37°C in a 5% carbon dioxide atmosphere with media changes twice a week. In 2010, we used a related strategy to isolate an additional 11 strains of the virus by inoculation of serum or homogenized white cells onto Vero cells. Electron Microscopy A DH82-cell monolayer that was infected with SFTSV in T25 flasks was fixed for transmission electron microscopy with Ito solution, as de- scribed previously.3 Ultrathin sections were cut on a Reichert–Leica Ultracut S ultramicrotome, stained with lead citrate and examined in a Phil- ips 201 or CM-100 electron microscope at 60 kV. Negative-stain electron microscopy was performed on virions purified from a clarified culture super- natant of infected Vero cells concentrated by a factor of 100.4,5 Genetic Analysis For the first SFTSV isolate, formalin-fixed cell cul- ture was used to extract viral RNA using a High Pure FFPE RNA Micro Kit (Roche Applied Sci- ence). 
The virus was sequenced with the use of the restriction-fragment–length-polymorphism assay with amplified complementary DNA, as described previously.6 For the remaining 11 strains of the virus, the whole genomes were sequenced with the use of the sequence-independent, single-primer amplification (SISPA) method.7 The 5' and 3' ter- minals of viral RNA segments were determined with a RACE Kit (Invitrogen). Phylogenetic analy- ses were performed with the neighbor-joining method with the use of the Poisson correction and complete deletion of gaps. Neutralization Assay For microneutralization testing, serial dilutions of serum samples were mixed with an equal vol- ume of 100 median tissue-culture infectious dos- es of SFTSV (strain HB29) and incubated at 37°C for 1.5 hours. The mixture was then added to a 96-well plate containing Vero cells in quadrupli- cate. The plates were incubated at 37°C in a 5% carbon dioxide atmosphere for 12 days. Viral in- fection was detected on specific immunofluores- cence assays in serum samples from patients with laboratory-confirmed infection. The end-point ti- ter was expressed as the reciprocal of the highest dilution of serum that prevented infection. Polymerase Chain Reaction RNA that was extracted from serum, whole blood, or homogenized arthropods was amplified with the use of a one-step, multiplex real-time reverse- transcriptase polymerase chain reaction (RT-PCR) with primers for SFTSV (Qiagen). The cutoff cycle- threshold value for a positive sample was set at 35 cycles. Nested RT-PCR and sequencing were used to verify samples from which only one ge- nomic segment was amplified. Virus Isolation The first SFTSV (strain DBM) was isolated from a 42-year-old man from Henan Province. A month after inoculation of cell monolayers with white cells obtained from the patient, virus-induced cellular changes visible on light microscopy (cyto- pathic effect) were observed in DH82 cells but not in the other cell lines. 
The morphologic features of infected DH82 cells changed from round mono- cytes to an elongated shape, which had granular particles in the cytoplasm (Fig. 2A). After several passages in culture, the cytopathic effect usually appeared on day 4 after inoculation of a fresh monolayer. Subsequently, 11 additional strains of the virus were isolated from serum samples ob- tained from patients during the acute phase of illness in six provinces with the use of Vero cells (Table 1 in the Supplementary Appendix, available with the full text of this article at NEJM.org). SFTSV can infect a variety of cells, including L929, Vero E6, Vero (Fig. 2B), and DH82 cells, but it re- sulted in the cytopathic effect only in DH82 cells. The viral particles were spheres with a diameter of 80 to 100 nm. Negative-stain electron microscopy of SFTSV particles that were purified from the su- pernatants of infected Vero cells revealed complex surface projections (Fig. 2C). Transmission electron microscopy revealed viral particles in the DH82-cell cytoplasm. The virions were observed inside vacu- oles, presumably in the Golgi apparatus (Fig. 2D). Partial sequences were obtained from the first isolated virus strain DBM, and the complete ge- nomes of 11 additional human isolates of SFTSV were determined. (GenBank accession numbers are provided in Table 1 in the Supplementary Ap- pendix.) All isolates including strain DBM were closely related (96% homology of nucleotide se- quences for all segments). The terminals of the three genomic segments of SFTSV were found to be similar to counterparts in other phlebovirus- es.8 The L segment contains 6368 nucleotides with one open reading frame encoding 2084 amino acids. The M segment contains 3378 nu- cleotides with one open reading frame encoding 1073 amino acid precursors of glycoproteins (Gn and Gc). 
The S segment contains 1744 nucleo- tides of ambisense RNA encoding two proteins, the N and NSs proteins, in opposite orientations, separated by a 62-bp intergenic region. Phylogenetic trees based on partial or complete viral genomic sequences of L, M, and S segments from strains DBM, HN6, and HB29 showed that SFTSV was related to prototypic viruses of the five genera of Bunyaviridae (Fig. 1 in the Supple- mentary Appendix). Among the genera orthobun- yavirus, hantavirus, nairovirus, phlebovirus, and tospovirus, SFTSV belongs to the phlebovirus genus8 but was more distantly related to proto- typic viruses in the other four genera. To verify this finding, we carried out a phylogenetic analy- sis, using complete deduced amino acid sequenc- es coding for RNA-dependent RNA polymerase, glycoproteins (Gn and Gc), and N and NSs pro- teins of SFTSV (strains HB29, HN6, AN12, LN2, JS3, and SD4) from six provinces in China, as com- pared with the other known phleboviruses (Fig. 3). The generated phylogenetic tree showed that all SFTSV isolates clustered together but were near- ly equidistant from the other two groups,9 the Sandfly fever group (Rift Valley fever virus, Punta Toro virus, Toscana virus, Massila virus, and Sandfly fever Sicilian virus) and the Uukuniemi group. This suggested that SFTSV is the proto- type of a third group in the phlebovirus genus. A comparison of the similarity of amino acid sequences provided further evidence that SFTSV is distinct from the other phleboviruses (Table 2 in the Supplementary Appendix). Both RNA- dependent RNA polymerase and glycoproteins of SFTSV are slightly more closely related to coun- terparts in Uukuniemi virus. However, N pro- teins in SFTSV and Rift Valley fever virus had 41.4% similarity. In contrast, the amino acids in NSs proteins encoded by the S segment showed a similarity of only 11.2 to 16.0% with amino acids in other phleboviruses. 
Serologic Analysis We evaluated seroconversion against SFTSV in pa- tients with SFTS using three different methods: immunof luorescence assay, ELISA, and microneu- tralization. We chose a cohort of 35 patients with RT-PCR–confirmed SFTSV infection who had se- rum samples from both acute and convalescent phases of the illness. An elevation in the anti- body titer by a factor of four or seroconversion was observed in all 35 patients, as seen especially on microneutralization (Table 1). These results indi- cated that high levels of neutralizing antibodies were generated during the convalescent phase of the illness. An antibody titer of more than 1:25,600 on ELISA was present in 15 convalescent-phase serum samples, indicating a robust humoral im- mune response against SFTSV. Among the 35 se- ropositive samples, all SFTSV infections were confirmed on viral RNA sequencing, and 11 were confirmed on virus isolation. It is noteworthy that specific neutralizing antibodies against SFTSV persisted in some convalescent-phase serum sam- ples even 1 year after recovery. Clinical Symptoms The first patient, a 42-year-old male farmer, pre- sented with fever (temperatures of 39.2 to 39.7°C), fatigue, conjunctival congestion, diarrhea, abdom- inal pain, leukocytopenia, thrombocytopenia, pro- teinuria, and hematuria. Later, a unique group of hospitalized patients with acute high fever with thrombocytopenia was identified. We analyzed only 81 patients with laboratory-confirmed SFTSV infection who had a complete medical record for the clinical spectrum of SFTS. The clinical symp- toms of SFTS were nonspecific, and the major symptoms included fever and gastrointestinal symptoms. Regional lymphadenopathy was also frequently observed (Table 2). The most common abnormalities on laboratory testing were thrombo- cytopenia (95%) and leukocytopenia (86%) (Table 3). 
Multiorgan failure developed rapidly in most patients, as shown by elevated levels of serum ala- nine aminotransferase, aspartate aminotransfer- ase, creatine kinase, and lactate dehydrogenase. Proteinuria (in 84% of patients) and hematuria (in 59%) were also observed. Among the 171 con- firmed cases, there were 21 deaths (12%). However, it is not clear how SFTSV caused these deaths. Epidemiologic Investigation From June 2009 through September 2010, we de- tected SFTS bunyavirus RNA, specific antiviral antibodies, or both in 171 patients among 241 hospitalized patients who met the case defini- tion for SFTS2 in Central and Northeast China. These patients included 43 in Henan, 52 in Hubei, 93 in Shandong, 31 in Anhui, 11 in Jiangsu, and 11 in Liaoning provinces. In 2010, a total of 148 of 154 laboratory-confirmed cases (96%) occurred from May to July. The ages of the patients ranged from 39 to 83 years, and 115 of 154 patients (75%) were over 50 years of age. Of these 154 patients, 86 (56%) were women, and 150 (97%) were farm- ers living in wooded and hilly areas and working in the fields before the onset of disease. No SFTSV was identified on real-time RT-PCR and no anti- bodies against SFTSV were identified in serum samples that were collected from 200 patient- matched healthy control subjects in the endemic areas, from 180 healthy subjects from nonendem- ic areas, and from 54 patients with suspected hem- orrhagic fever with renal syndrome. Mosquitoes and ticks were commonly found in the patients’ home environment. However, viral RNA was not detected in any of 5900 mosquitoes tested. On the other hand, 10 of 186 ticks (5.4%) of the species Haemaphysalis longicornis that were collected from domestic animals in the areas where the patients lived contained SFTSV RNA. 
The viruses in the ticks were isolated in Vero cell culture, and the RNA sequences of these viruses were very closely related but not identical to the SFTSV isolated in samples obtained from the patients (data not shown). There was no epidemiologic evidence of human-to-human transmission of the virus. Discussion Although we have not fulfilled Koch’s postulates for establishing a causal relationship between a mi- crobe and a disease in their entirety, our findings suggest that SFTS is caused by a newly identified bunyavirus. These data include epidemiologic, clinical, and laboratory findings and several lines of evidence that include virus isolation, viral RNA detection, and molecular and serologic analyses. SFTS has been identified in Central and Northeast China, which covers all six provinces where sur- veillance for SFTS was carried out. """)
    doc.add_tier(self.annotator)
    geoname_texts = [span.text for span in doc.tiers['geonames'].spans]
    # assertNotIn (rather than assertTrue on a `not in` expression) prints
    # the full list of matched texts when the check fails.
    self.assertNotIn('Northeast', geoname_texts)
def test_unparsable_date_bug(self):
    # Regression test with no assertions: it passes as long as annotating
    # this table, whose "Week updated" column mixes week numbers with
    # bracketed dates, does not raise.
    table_text = """ Cases by Country / Week updated / Probable / Conf. / Virus type / DHF severe / Deaths Hispanic Caribbean Dominican Republic / 17 [week ending 28 Apr 2017] / 315 / 0 / D? / 15 / 0 Puerto Rico / 19 [week ending 12 May 2017] / 9 / 0 / D2 / 0 / 0 English, French, Dutch Caribbean American Virgin Islands / 19 [week ending 12 May 2017] / 1 / 1 / D? / 0 / 0 Andean Bolivia / 17 / [week ending 28 Apr 2017] / 4260 / 0 / D? / 34 / 0 Colombia / 20 [week ending 19 May 2017] / 12 552 / 8357 / D? / 131 / 36 Ecuador / 17 [week ending 28 Apr 2017] / 6075 / 6075 / D? / 6 / 3 Peru / 20 [week ending 19 May 2017] / 44 971 / 12 717 / D 2,3 / 137 / 54 Venezuela / 17 [week ending 28 Apr 2017] / 2722 / 309 / D? / 7 / 0 """
    annotated = AnnoDoc(table_text)
    annotated.add_tier(self.annotator)
def test_adjacent_state_name(self):
    # The bracketed state name next to "Washington County" should steer
    # resolution to the Pennsylvania (admin1 code 'PA') candidate.
    headline = """3 at Washington County [Pennsylvania] Shelter Treated For Rabies Exposure"""
    annotated = AnnoDoc(headline)
    annotated.add_tier(self.annotator)
    first_geoname = annotated.tiers['geonames'].spans[0].geoname
    self.assertEqual(first_geoname['admin1_code'], 'PA')
def test_dashes_3(self):
    # The dash in "2017-17" separates two dates rather than being part of
    # either; the expected end is the day after 17 Apr 2018.
    text = 'Distribution of reported yellow fever cases from 1 Jul 2017-17 Apr 2018'
    annotated = AnnoDoc(text)
    annotated.add_tier(self.annotator)
    observed_range = annotated.tiers['dates'].spans[0].datetime_range
    expected_range = [
        datetime.datetime(2017, 7, 1),
        datetime.datetime(2018, 4, 18),
    ]
    self.assertEqual(observed_range, expected_range)
def test_adjacent_state_name_2(self):
    # "Springfield, IL": the trailing state abbreviation should select the
    # Illinois candidate.
    annotated = AnnoDoc("In the city of Springfield, IL")
    annotated.add_tier(self.annotator)
    first_geoname = annotated.tiers['geonames'].spans[0].geoname
    self.assertEqual(first_geoname['geonameid'], '4250542')
    self.assertEqual(first_geoname['admin1_code'], 'IL')
def test_date_count_table_2(self):
    # Pipe-delimited table of report dates and case counts.
    table = AnnoDoc(""" | Report date | Cases | | 6 Oct 2017 | 26 | | 13 Oct 2017 | 29 | | 20 Oct 2017 | 34 |""")
    table.add_tier(self.annotator)
    incident_props = []
    for span in table.tiers['structured_incidents']:
        incident_props.append(remove_empty_props(span.metadata))
    # (value, range start, range end) per row. Each expected range is the
    # seven days ending on the day after the report date.
    expected_rows = [
        (26, datetime.datetime(2017, 9, 30), datetime.datetime(2017, 10, 7)),
        (29, datetime.datetime(2017, 10, 7), datetime.datetime(2017, 10, 14)),
        (34, datetime.datetime(2017, 10, 14), datetime.datetime(2017, 10, 21)),
    ]
    expected = []
    for value, range_start, range_end in expected_rows:
        expected.append({
            'value': value,
            'type': 'caseCount',
            'attributes': [],
            'dateRange': [range_start, range_end],
        })
    self.assertEqual(incident_props, expected)
def test_since_date(self):
    # "since <date>" should give a range from that date up to the date
    # handed to AnnoDoc.
    text = 'nearly 5000 cases have been reported since 1 Sep 2010.'
    annotated = AnnoDoc(text, date=datetime.datetime(2010, 12, 10))
    annotated.add_tier(self.annotator)
    observed_range = annotated.tiers['dates'].spans[0].datetime_range
    expected_range = [
        datetime.datetime(2010, 9, 1),
        datetime.datetime(2010, 12, 10),
    ]
    self.assertEqual(observed_range, expected_range)
def test_dashes_2(self):
    # Two dash-separated ISO-style dates joined by another dash; the
    # expected end is the day after the second date.
    text = 'First seen between 2010-1-1 - 2011-1-1'
    annotated = AnnoDoc(text)
    annotated.add_tier(self.annotator)
    observed_range = annotated.tiers['dates'].spans[0].datetime_range
    expected_range = [
        datetime.datetime(2010, 1, 1),
        datetime.datetime(2011, 1, 2),
    ]
    self.assertEqual(observed_range, expected_range)
def test_dashes(self):
    # "between A and B" with dash-separated dates; the expected end is the
    # day after the second date.
    text = 'Adenoviruses, first seen between 2010-1-1 and 2010-1-2'
    annotated = AnnoDoc(text)
    annotated.add_tier(self.annotator)
    observed_range = annotated.tiers['dates'].spans[0].datetime_range
    expected_range = [
        datetime.datetime(2010, 1, 1),
        datetime.datetime(2010, 1, 3),
    ]
    self.assertEqual(observed_range, expected_range)
def test_1950s(self):
    # A decade reference should cover 1950-01-01 up to 1960-01-01.
    text = 'Adenoviruses, first isolated in the 1950s from explanted adenoid tissue.'
    annotated = AnnoDoc(text)
    annotated.add_tier(self.annotator)
    observed_range = annotated.tiers['dates'].spans[0].datetime_range
    expected_range = [
        datetime.datetime(1950, 1, 1),
        datetime.datetime(1960, 1, 1),
    ]
    self.assertEqual(observed_range, expected_range)
def test_inexact_range(self):
    # Month-to-month range with a single trailing year; the expected range
    # runs from 1 May 2009 to 1 Sep 2009.
    text = 'From May to August of 2009 we languished there.'
    annotated = AnnoDoc(text)
    annotated.add_tier(self.annotator)
    observed_range = annotated.tiers['dates'].spans[0].datetime_range
    expected_range = [
        datetime.datetime(2009, 5, 1),
        datetime.datetime(2009, 9, 1),
    ]
    self.assertEqual(observed_range, expected_range)
def create_annotations(article_uri, content):
    """Annotate *content* and store the resulting spans via SPARQL updates.

    Every annotator in the module-level ``annotators`` is applied to the
    text. Then, for each of six tiers, one SPARQL update is rendered from a
    template and passed to ``sparql_utils.update``.

    article_uri -- URI of the source document; used both as the
        ``anno:source_doc`` value and as input to each span's URI hash.
    content -- the text to annotate.
    """
    doc = AnnoDoc(content)
    for annotator in annotators:
        doc.add_tier(annotator)

    def get_span_uri(span):
        # The URI is derived from the article URI and the span's offsets
        # only; the tier name is not part of the hash.
        digest = hashlib.md5()
        digest.update(article_uri)
        digest.update(':'.join([str(span.start), str(span.end)]))
        return "http://www.eha.io/types/annotation/annie/span/" + str(digest.hexdigest())

    exported_tiers = (
        'geonames',
        'diseases',
        'hosts',
        'modes',
        'pathogens',
        'symptoms',
    )
    for tier_name in exported_tiers:
        tier = doc.tiers[tier_name]
        # NOTE(review): the `rdf:` prefix below is bound to the RDF Schema
        # namespace, so `rdf:type` expands to rdf-schema#type - confirm that
        # this is what consumers of the store expect.
        # NOTE(review): `escape` is presumably the template engine's HTML
        # escaping rather than SPARQL literal escaping - confirm.
        template = make_template(""" prefix anno: <http://www.eha.io/types/annotation_prop/> prefix eha: <http://www.eha.io/types/> prefix rdf: <http://www.w3.org/2000/01/rdf-schema#> prefix dc: <http://purl.org/dc/terms/> {% for span in spans %} INSERT DATA { <{{get_span_uri(span)}}> anno:annotator eha:annie {% if span.geoname %} ; rdf:type eha:geoname_annotation ; anno:geoname <http://sws.geonames.org/{{span.geoname.geonameid}}> {% else %} ; rdf:type eha:keyword_annotation ; anno:category "{{tier_name}}" {% endif %} ; anno:label "{{span.label | escape}}" ; anno:source_doc <{{source_doc}}> ; anno:start {{span.start}} ; anno:end {{span.end}} ; anno:selected-text "{{span.text | escape}}" } ; {% if tier_name == "diseases" %} INSERT DATA { {% for entity_uri in resolve_keyword(span.label) %} <{{entity_uri}}> dc:relation <{{get_span_uri(span)}}> . {% endfor %} } ; {% endif %} {% endfor %} INSERT DATA { <{{source_doc}}> anno:annotated_by eha:annie_1 } """)
        sparql_update = template.render(
            get_span_uri=get_span_uri,
            resolve_keyword=resolve_keyword,
            source_doc=article_uri,
            tier_name=tier_name,
            spans=tier.spans)
        sparql_utils.update(sparql_update)
def diagnose(
        self,
        content,
        diseases_only=False,
        content_date=None,
        use_infection_annotator=False,
        include_incidents=False):
    """Classify a document and extract its annotated features.

    content -- text of the document to diagnose.
    diseases_only -- when true, return only ``{'diseases': [...]}`` and
        skip all annotation work.
    content_date -- passed to ``AnnoDoc`` as the document's ``date``.
    use_infection_annotator -- when true, the 'counts' tier is produced by
        ``self.infection_annotator`` (its 'infection' attribute is renamed
        to 'case') instead of ``self.count_annotator``.
    include_incidents -- when true, an ``IncidentAnnotator`` tier is added
        and reported under the 'incidents' key of the result.

    Returns a dict with the keys 'diagnoserVersion', 'dateOfDiagnosis',
    'diseases', 'structuredIncidents' and 'features', plus 'incidents'
    when ``include_incidents`` is set.
    """
    time_sofar = time_sofar_gen(datetime.datetime.now())
    base_keyword_dict = self.keyword_extractor.transform([content])[0]
    feature_dict = self.keyword_processor.transform([base_keyword_dict])
    X = self.dict_vectorizer.transform(feature_dict)[0]
    logger.info(time_sofar.next() + 'Computed feature vector')

    def diagnosis(i, p):
        """Describe class ``i`` (predicted with probability ``p``)."""
        scores = self.classifier.coef_[i] * X
        # Scores are normalized so they can be compared across different
        # classifications.
        norm = np.linalg.norm(scores)
        if norm > 0:
            scores /= norm
        scores *= p
        # Materialized so that both comprehensions below can iterate it.
        scored_keywords = list(zip(self.keywords, scores))
        # These might be numpy types. They are coerced to native python
        # types so the output can easily be serialized as json.
        return {
            'name': unicode(self.classifier.classes_[i]),
            'probability': float(p),
            # Positively scored keywords that occur in the document.
            'keywords': [{
                'name': unicode(kwd),
                'score': float(score),
            } for kwd, score in scored_keywords
                if score > 0 and kwd in base_keyword_dict],
            # Positively scored keywords absent from the extracted
            # keyword dict.
            'inferred_keywords': [{
                'name': unicode(kwd),
                'score': float(score),
            } for kwd, score in scored_keywords
                if score > 0 and kwd not in base_keyword_dict]
        }

    diseases = [diagnosis(i, p) for i, p in self.best_guess(X)]
    if diseases_only:
        return {
            'diseases': diseases
        }
    logger.info(time_sofar.next() + 'Diagnosed diseases')

    anno_doc = AnnoDoc(content, date=content_date)
    anno_doc.add_tier(self.keyword_annotator)
    logger.info('keywords annotated')
    anno_doc.add_tier(self.resolved_keyword_annotator)
    logger.info('resolved keywords annotated')
    anno_doc.add_tier(self.date_annotator)
    logger.info('dates annotated')
    if use_infection_annotator:
        anno_doc.add_tier(self.infection_annotator)
        # Expose the infection tier under the name the rest of this
        # method reads counts from.
        anno_doc.tiers['counts'] = anno_doc.tiers.pop('infections')
        attribute_remappings = {
            'infection': 'case'
        }
        for span in anno_doc.tiers['counts']:
            span.metadata['attributes'] = [
                attribute_remappings.get(attribute, attribute)
                for attribute in span.metadata['attributes']]
    else:
        anno_doc.add_tier(self.count_annotator)
    logger.info('counts annotated')
    anno_doc.add_tier(self.geoname_annotator)
    logger.info('geonames annotated')
    anno_doc.add_tier(StructuredIncidentAnnotator())
    logger.info('structured incidents annotated')
    anno_doc.filter_overlapping_spans(
        tier_names=[
            'dates', 'geonames', 'diseases', 'hosts', 'modes',
            'pathogens', 'symptoms'
        ]
    )
    logger.info('filtering overlapping spans done')

    dates = []
    for span in anno_doc.tiers['dates']:
        range_start, range_end = span.datetime_range
        dates.append({
            'type': 'datetime',
            'name': span.text,
            'value': span.text,
            'textOffsets': [
                [span.start, span.end]
            ],
            'timeRange': {
                'beginISO': range_start.isoformat().split('T')[0],
                'begin': {
                    'year': range_start.year,
                    'month': range_start.month,
                    'date': range_start.day
                },
                # The date range does not include the end day.
                'endISO': range_end.isoformat().split('T')[0],
                'end': {
                    'year': range_end.year,
                    'month': range_end.month,
                    'date': range_end.day
                },
            }
        })

    # One feature per distinct geoname; repeated mentions only add offsets.
    geonames_grouped = {}
    for span in anno_doc.tiers['geonames']:
        geonameid = span.geoname['geonameid']
        if geonameid not in geonames_grouped:
            geonames_grouped[geonameid] = {
                'type': 'location',
                'name': span.geoname.name,
                'geoname': span.geoname.to_dict(),
                'textOffsets': [
                    [span.start, span.end]
                ]
            }
        else:
            geonames_grouped[geonameid]['textOffsets'].append(
                [span.start, span.end]
            )
    logger.info(time_sofar.next() + 'Annotated geonames')

    counts = []
    count_spans = anno_doc.tiers['counts'].without_overlaps(
        anno_doc.tiers['structured_data'])
    for span in count_spans:
        count_dict = dict(span.metadata)
        count_dict['type'] = 'count'
        count_dict['text'] = span.text
        count_dict['label'] = span.label
        count_dict['textOffsets'] = [[span.start, span.end]]
        counts.append(count_dict)
        # Include legacy case counts so the diagnostic dashboard
        # doesn't break.
        if 'case' in count_dict['attributes']:
            counts.append({
                'type': 'caseCount',
                'text': count_dict['text'],
                'value': count_dict['count'],
                'modifiers': count_dict['attributes'],
                'cumulative': "cumulative" in count_dict['attributes'],
                'textOffsets': count_dict['textOffsets']
            })

    # One feature per distinct label within each keyword type.
    keyword_types = ['diseases', 'hosts', 'modes', 'pathogens', 'symptoms']
    keyword_groups = {}
    for keyword_type in keyword_types:
        group = keyword_groups[keyword_type] = {}
        for span in anno_doc.tiers['keyword.' + keyword_type]:
            if span.label not in group:
                group[span.label] = {
                    'type': keyword_type,
                    'value': span.label,
                    'textOffsets': [[span.start, span.end]]
                }
            else:
                group[span.label]['textOffsets'].append(
                    [span.start, span.end]
                )

    resolved_keywords = []
    resolved_spans = anno_doc.tiers['resolved_keywords'].without_overlaps(
        anno_doc.tiers['geonames'])
    for span in resolved_spans:
        resolved_keywords.append({
            'type': 'resolvedKeyword',
            'resolutions': span.metadata['resolutions'],
            'text': span.text,
            'textOffsets': [[span.start, span.end]]})

    # Feature order: counts, geonames, dates, keyword groups in
    # keyword_types order, then resolved keywords. The initial
    # concatenation builds a new list, so extending it leaves `counts`
    # untouched.
    features = counts + list(geonames_grouped.values()) + dates
    for keyword_type in keyword_types:
        features += list(keyword_groups[keyword_type].values())
    features += resolved_keywords

    result = {
        'diagnoserVersion': self.__version__,
        'dateOfDiagnosis': datetime.datetime.now(),
        'diseases': diseases,
        'structuredIncidents': [
            dict(
                incident.metadata,
                textOffsets=[[incident.start, incident.end]])
            for incident in anno_doc.tiers['structured_incidents']],
        'features': features}

    if include_incidents:
        result['incidents'] = []
        anno_doc.add_tier(IncidentAnnotator())
        for incident_span in anno_doc.tiers['incidents']:
            metadata = incident_span.metadata
            incident_data = {
                # Offsets of this incident's own span. An earlier version
                # read them from a leftover loop variable of a previous
                # loop instead.
                'offsets': [incident_span.start, incident_span.end],
                'type': metadata['type'],
                'value': metadata['value'],
                'dateRange': [
                    d.isoformat().split('T')[0]
                    for d in metadata['dateRange']],
                'locations': metadata['locations'],
                'species': metadata['species'],
                'status': metadata.get('status'),
                'resolvedDisease': metadata.get('resolvedDisease'),
                # Default used when the incident carries no count
                # annotation; replaced below otherwise.
                'annotations': {
                    'case': [{
                        'offsets': [incident_span.start, incident_span.end]
                    }]
                }
            }
            if 'count_annotation' in metadata:
                count_annotation = metadata['count_annotation']
                incident_data['annotations'] = {
                    'case': [{
                        'offsets': [
                            count_annotation.start, count_annotation.end]
                    }],
                    'date': [
                        {'offsets': [anno.start, anno.end]}
                        for anno in metadata['date_territory'].metadata
                    ],
                    'location': [
                        {'offsets': [anno.start, anno.end]}
                        for anno in metadata['geoname_territory'].metadata
                    ],
                    'disease': [
                        {'offsets': [anno.start, anno.end]}
                        for anno in metadata['disease_territory'].metadata
                    ]
                }
            result['incidents'].append(incident_data)
    return result
from templater import make_template import sparql_utils import hashlib from epitator.annotator import AnnoDoc from epitator.keyword_annotator import KeywordAnnotator from epitator.geoname_annotator import GeonameAnnotator import re from pylru import lrudecorator annotators = [ KeywordAnnotator(), GeonameAnnotator(), ] # Test that the keyword annotator is set up correctly test_doc = AnnoDoc("ebola influenza glanders dermatitis") for annotator in annotators: test_doc.add_tier(annotator) assert( set(disease.label for disease in test_doc.tiers["diseases"].spans) - set("ebola influenza glanders dermatitis".split(" ")) == set()) @lrudecorator(500) def resolve_keyword(keyword): query = make_template(""" prefix anno: <http://www.eha.io/types/annotation_prop/> prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> prefix obo: <http://purl.obolibrary.org/obo/> prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT ?entity WHERE {