Exemplo n.º 1
0
    def test_init(self):
        e = EDTFDate()
        self.assertEqual(unicode(e), "")

        e = EDTFDate('2001-02')
        self.assertEqual(unicode(e), '2001-02')

        e = EDTF.from_natural_text(None)
        self.assertEqual(unicode(e), "")
Exemplo n.º 2
0
    def test_natural_language(self):
        for i, o in [

            ('', ''),
            ('this isn\'t a date', ''),
            ('90', '1990'), #implied century
            ('1860', '1860'),
            ('the year 1800', '1800'),
            ('the year 1897', '1897'),
            ('January 2008', '2008-01'),
            ('January 12, 1940', '1940-01-12'),

            # uncertain/approximate
            ('1860?', '1860?'),
            ('1862 (uncertain)', '1862?'),
            ('maybe 1862', '1862?'),
            ('1862 maybe', '1862?'),
            ('1862 guess', '1862?'),
            ('uncertain: 1862', '1862?'),
            ('uncertain: Jan 18 1862', '1862-01-18?'),
            ('~ Feb 1812', '1812-02~'),
            ('circa Feb 1812', '1812-02~'),
            ('Feb 1812 approx', '1812-02~'),
            ('c1860', '1860~'), #different abbreviations
            ('c.1860', '1860~'), #with or without .
            ('ca1860', '1860~'),
            ('ca.1860', '1860~'),
            ('c 1860', '1860~'), # with or without space
            ('c. 1860', '1860~'),
            ('ca. 1860', '1860~'),
            ('approx 1860', '1860~'),
            ('1860 approx', '1860~'),
            ('1860 approximately', '1860~'),
            ('approximately 1860', '1860~'),
            ('about 1860', '1860~'),
            ('about Spring 1849', '1849-21~'),
            ('notcirca 1860', '1860'), # avoid words containing circa
            ('attica 1802', '1802'), #avoid false positive circa at the end of preceding word
            ('attic. 1802', '1802'), #avoid false positive circa

            # masked precision
            ('1860s', '186x'), #186x has decade precision, 186u has year precision.

            # masked precision + uncertainty
            ('ca. 1860s', '186x~'),
            ('c. 1860s', '186x~'),
            ('Circa 1840s', '184x~'),
            ('circa 1840s', '184x~'),
            ('ca. 1860s?', '186x?~'),
            ('uncertain: approx 1862', '1862?~'),

            # masked precision with first decade (ambiguous)
            ('1800s', '18xx'), # without additional uncertainty, use the century
            ('2000s', '20xx'), # without additional uncertainty, use the century
            ('c1900s', '190x~'), # if there's additional uncertainty, use the decade
            ('c1800s?', '180x?~'), # if there's additional uncertainty, use the decade

            # unspecified
            ('January 12', 'uuuu-01-12'),
            ('January', 'uuuu-01'),
            ('10/7/2008', '2008-10-07'),
            ('7/2008', '2008-07'),

            #seasons
            ('Spring 1872', '1872-21'),
            ('Summer 1872', '1872-22'),
            ('Autumn 1872', '1872-23'),
            ('Fall 1872', '1872-23'),
            ('Winter 1872', '1872-24'),

            # before/after
            ('earlier than 1928', 'unknown/1928'),
            ('before 1928', 'unknown/1928'),
            ('after 1928', '1928/unknown'),
            ('later than 1928', '1928/unknown'),
            ('before January 1928', 'unknown/1928-01'),
            ('before 18 January 1928', 'unknown/1928-01-18'),

            # before/after approx
            ('before approx January 18 1928', 'unknown/1928-01-18~'),
            ('before approx January 1928', 'unknown/1928-01~'),
            ('after approx January 1928', '1928-01~/unknown'),
            ('after approx Summer 1928', '1928-22~/unknown'),

            # before/after and uncertain/unspecificed
            ('after about the 1920s', '192x~/unknown'),
            ('before about the 1900s', 'unknown/190x~'),
            ('before the 1900s', 'unknown/19xx'),

            # unspecified
            # ('decade in 1800s', '18ux'), #too esoteric
            # ('decade somewhere during the 1800s', '18ux'), #lengthier. Keywords are 'in' or 'during'
            ('year in the 1860s', '186u'), #186x has decade precision, 186u has year precision.
            ('year in the 1800s', '18xu'),
            ('year in about the 1800s', '180u~'),
            ('month in 1872', '1872-uu'),
            ('day in Spring 1849', '1849-21-uu'),
            ('day in January 1872', '1872-01-uu'),
            ('day in 1872', '1872-uu-uu'),
            ('birthday in 1872', '1872'), #avoid false positive at end of preceding word

            #centuries
            ('1st century', '00xx'),
            ('10c', '09xx'),
            ('19th century', '18xx'),
            ('19th century?', '18xx?'),
            ('before 19th century', 'unknown/18xx'),
            ('19c', '18xx'),
            ('15c.', '14xx'),
            ('ca. 19c', '18xx~'),
            ('~19c', '18xx~'),
            ('about 19c', '18xx~'),
            ('19c?', '18xx?'),
            ('c.19c?', '18xx?~'),

            # c-c-c-combo
            # just showing off now...
            ('a day in about Spring 1849?', '1849-21-uu?~'),

            # simple ranges. Not all of these results are correct EDTF, but
            # this is as good as the EDTF implementation and simple natural
            # language parser we have.
            ('1851-1852', '1851/1852'),
            ('1851-1852; printed 1853-1854', '1851/1852'),
            ('1851-52', '1851/1852'),
            ('1852 - 1860', '1852/1860'),
            ('1856-ca. 1865', '1856/1865~'),
            ('1857-mid 1860s', '1857/186x'),
            ('1858/1860', '1858/1860'),
            ('1860s-1870s', '186x/187x'),
            ('1861, printed 1869', '1861'),
            ('1861-67', '1861/1867'),
            ('1861-67 (later print)', '1861/1867'),
            ('1863 or 1864', '1863'),
            ('1863, printed 1870', '1863'),
            ('1863, printed ca. 1866', '1863'),
            ('1864 or 1866', '1864'),
            ('1864, printed ca. 1864', '1864'),
            ('1864-1872, printed 1870s', '1864/1872'),
            ('1868-1871?', '1868/1871?'),
            ('1869-70', '1869/1870'),
            ('1870s, printed ca. 1880s', '187x'),
            ('1900-1903, cast before 1929', '1900/1903'),
            ('1900; 1973', '1900'),
            ('1900; printed 1912', '1900'),
            ('1915 late - autumn 1916', '1915/1916-23'),
            ('1915, from Camerawork, October 1916', '1915'),
            ('1920s -early 1930s', '192x/193x'),
            ('1930s, printed early 1960s', '193x'),
            ('1932, printed 1976 by Gunther Sander', '1932'),
            ('1938, printed 1940s-1950s', '1938'),

            # for these to work we need to recast is_uncertain and is_approximate
            # such that they work on different parts. Probably worth rolling our own
            # dateparser at this point.
            # ('July in about 1849', '1849~-07'),
            # ('a day in July in about 1849', '1849~-07-uu'),
            # ('a day in Spring in about 1849', '1849~-21-uu'),
            # ('a day in about July? in about 1849', '1849~-07?~-uu'),
            # ('a day in about Spring in about 1849', '1849~-21~-uu'),
            # ('maybe January in some year in about the 1830s', '183u~-01?'),
            # ('about July? in about 1849', '1849~-07?~'),
        ]:
            e = EDTF.from_natural_text(i)
            if e:
                self.assertEqual(unicode(e), o)
            else:
                self.assertEqual(e, o)
Exemplo n.º 3
0
    def test_different_types(self):
        e = EDTF('1983')
        self.assertEqual(e.is_interval, False)
        self.assertEqual(e.sort_date_earliest().isoformat(), '1983-01-01')
        self.assertEqual(e.sort_date_latest().isoformat(), '1983-12-31')
        self.assertEqual(e.date_earliest().isoformat(), '1983-01-01')
        self.assertEqual(e.date_latest().isoformat(), '1983-12-31')
        self.assertEqual(e.start_date_earliest().isoformat(), '1983-01-01')
        self.assertEqual(e.start_date_latest().isoformat(), '1983-01-01')
        self.assertEqual(e.end_date_earliest().isoformat(), '1983-12-31')
        self.assertEqual(e.end_date_latest().isoformat(), '1983-12-31')

        e = EDTF('1983/1985')
        self.assertEqual(e.is_interval, True)
        self.assertEqual(e.sort_date_earliest().isoformat(), '1983-01-01')
        self.assertEqual(e.sort_date_latest().isoformat(), '1985-12-31')
        self.assertEqual(e.date_earliest().isoformat(), '1983-01-01')
        self.assertEqual(e.date_latest().isoformat(), '1985-12-31')
        self.assertEqual(e.start_date_earliest().isoformat(), '1983-01-01')
        self.assertEqual(e.start_date_latest().isoformat(), '1983-12-31')
        self.assertEqual(e.end_date_earliest().isoformat(), '1985-01-01')
        self.assertEqual(e.end_date_latest().isoformat(), '1985-12-31')