Пример #1
0
def find_date(symbol, date_type):
    """
    Look up a date of the requested kind for an article.

    Each class reported by the infobox classifier is tried against the
    infobox resolver, and the first non-None answer is returned as is.
    Failing that, the article's first paragraph is scanned for dates.

    :param symbol: the article to look up
    :param date_type: 'birth-date' or 'death-date' for the paragraph
        fallback; the value is forwarded unchanged to the infobox resolver
    :returns: the resolver's answer, or a date lispified with typecode
        'yyyymmdd', or None when nothing was found
    """
    for cls in InfoboxClassifier().classify(symbol):
        resolved = InfoboxResolver().resolve_infobox(cls, symbol, date_type)
        if resolved is None:
            continue
        return resolved

    # TODO: look at categories for dates

    first_paragraph = get_article(symbol).paragraphs()[0]

    # NOTE(review): iter_paren presumably yields (start, end) offsets of
    # parenthesised spans -- confirm against its definition.
    for start, end in iter_paren(first_paragraph, "."):
        chunk = first_paragraph[start:end]

        # A date range gives the birth date as its first element and the
        # death date as its second one.
        for date_range in overlay_parse.dates.just_ranges(chunk):
            if date_type == 'birth-date':
                endpoint = date_range[0]
            elif date_type == 'death-date':
                endpoint = date_range[1]
            else:
                continue
            return lispify(endpoint, typecode='yyyymmdd')

        # No range found: a lone date only ever answers a birth date query.
        if date_type == 'birth-date':
            for lone_date in overlay_parse.dates.just_dates(chunk):
                return lispify(lone_date, typecode='yyyymmdd')

    return None
Пример #2
0
    def synonyms(self, symbol):
        """
        Gather the synonyms proposed by every configured inducer.

        :param symbol: the symbol handed to each inducer's ``induce``
        :returns: the lispified set of all induced synonyms
        """
        merged = set()
        batches = (ind.induce(symbol) for ind in self.synonym_inducers)

        # Inducers are consulted one at a time, in their configured order.
        for batch in batches:
            merged.update(batch)

        return lispify(merged)
Пример #3
0
    def resolve_infobox(self, cls, symbol, attr):
        """
        Return the value of the attribute for the article.

        :param cls: passed to ``get_infoboxes`` as the ``cls`` filter
        :param symbol: the article title
        :param attr: the attribute to look up; when it is a LispType its
            ``typecode`` and ``val`` are used, otherwise the resolver's
            own ``_typecode`` applies
        :returns: the lispified value from the first infobox that has a
            truthy value for the attribute, or None when the symbol
            contains a newline or no infobox provides the attribute
        """

        if "\n" in symbol:
            # There are no newlines in article titles
            return None

        if isinstance(attr, LispType):
            typecode, attr = attr.typecode, attr.val
        else:
            typecode = self._typecode

        infoboxes = get_infoboxes(symbol, cls=cls, fetcher=self.fetcher)

        for ibox in infoboxes:
            result = ibox.get(attr)
            if result:
                # Arguments are handed to the logger instead of being
                # %-formatted eagerly: a tuple-valued attr would make
                # the eager form raise TypeError.
                self.log().info("Found infobox attribute '%s'", attr)
                assert(isinstance(result, unicode))  # TODO: remove for production

                return lispify(result, typecode=typecode, infobox_attr=attr)

            self.log().warning("Could not find infobox attribute '%s'", attr)

        self.log().warning("Could not resolve attribute '%s' for '%s' with "
                           "class '%s'", attr, symbol, cls)
        return None
Пример #4
0
 def url(self, article, _):
     """
     Return the article's URL, lispified with typecode 'url'.

     The result is the wikipedia.org address of the article, NOT the
     location the page was actually fetched from.
     """
     # Going through get_article also takes care of redirections.
     return lispify(get_article(article).url(), typecode='url')
Пример #5
0
    def short_article(self, symbol, _):
        """
        Return the article's first paragraph, lispified with typecode 'html'.

        The intended contract is to keep adding paragraphs until at least
        350 rendered characters are covered. That part is not implemented
        yet, so only the first paragraph (with its HTML kept) is returned.
        """

        # TODO: check if the first paragraph is shorter than 350 characters
        article = get_article(symbol)
        return lispify(article.first_paragraph(keep_html=True),
                       typecode='html')
Пример #6
0
    def number(self, article, _):
        """
        Guess whether the article's subject is plural.

        Occurrences of singular verb forms in the first paragraph are
        weighed against plural ones; the lispified answer (typecode
        'calculated') is true only when the plural forms strictly win.
        """
        singular_verbs = (' is ', ' was ', ' has ')
        plural_verbs = (' are ', ' were ', ' have ')

        # The first paragraph tends to talk about the symbol itself
        # rather than about things related to it.
        text = get_article(article).first_paragraph()

        singular_hits = sum(text.count(verb) for verb in singular_verbs)
        plural_hits = sum(text.count(verb) for verb in plural_verbs)

        # Strict inequality because singular forms are far more common.
        return lispify(plural_hits > singular_hits, typecode='calculated')
Пример #7
0
    def proper(self, article, _):
        """
        Give a quick boolean answer computed from the symbol text and the
        article text, lispified with typecode 'calculated'.

        The title (underscores turned into spaces, parenthesised part
        dropped) is counted in the article text in lowercase and as
        written. The answer is true when the lowercase count, minus the
        lowercase occurrences that directly follow ". ", is smaller than
        the as-written count.
        """

        # Ported as-is from the Ruby version.
        title = re.sub(r"\s*\(.*\)\s*", "", article.replace("_", " "))
        lowered = title.lower()
        text = totext(get_article(article).html_source())

        lowercase_hits = text.count(lowered) - text.count(". " + lowered)
        is_proper = lowercase_hits < text.count(title)

        return lispify(is_proper, typecode='calculated')
Пример #8
0
    def get_callable(self, symbol):
        """
        Turn a function name into a callable.

        A Symbol is looked up by its name in ``self.resources()``. A
        Keyword yields a function that lispifies its arguments using the
        keyword's name as typecode.

        :raises TypeError: when symbol is neither a Symbol nor a Keyword
        """

        if isinstance(symbol, Symbol):
            resources = self.resources()
            return resources[symbol._name]

        if not isinstance(symbol, Keyword):
            description = (symbol, str(type(symbol)))
            raise TypeError("Could not resolve function %s (type %s)."
                            % description)

        def lispify_with_keyword(*args):
            # The keyword's name is read at call time, as a typecode.
            return lispify(*args, typecode=symbol._name)

        return lispify_with_keyword
Пример #9
0
    def attributes(self, cls, symbol):
        """
        List every attribute of the infoboxes selected for the symbol.

        :returns: a lispified list with one entry per parsed markup
            attribute: ``code`` is the upper-cased attribute name and
            ``rendered`` is the value stored in the infobox's rendered
            attributes under the same name with '-' replaced by '_'
            (None when absent)
        """

        collected = []

        for ibox in get_infoboxes(symbol, cls=cls, fetcher=self.fetcher):
            # Only the attribute names are needed from the parsed markup.
            for name, _ in ibox.markup_parsed_iter():
                shown = ibox.rendered_attributes().get(name.replace('-', '_'))
                collected.append(dict(code=name.upper(), rendered=shown))

        return lispify(collected)
Пример #10
0
    def get(self, cls, symbol, attr):
        """
        Gets the value of a symbol's attribute.

        Resolvers are asked in order and the first non-None answer wins.

        :param cls: Wikipedia class of the symbol
        :param symbol: the Wikipedia article
        :param attr: the attribute to get
        :returns: the attribute's value or an error, lispified; the
            lispified ``[None]`` when there are no resolvers or none of
            them produces an answer
        """
        # Bound up front so that an empty resolver list yields None
        # instead of raising UnboundLocalError at the return below.
        res = None

        for ar in self.resolvers:
            res = ar.resolve(cls, symbol, attr)
            if res is not None:
                break

        return lispify([res])
Пример #11
0
    def coordinates(self, article, _):
        """
        Return the coordinates found in the article's infoboxes.

        Every infobox is searched for a latitude and a longitude span
        nested under the span with id 'coordinates'. The first infobox
        providing both is used; infoboxes without HTML source or without
        both spans are skipped rather than ending the search.

        :param article: the article whose infoboxes are inspected
        :returns: ``[latitude, longitude]`` lispified with typecode
            'coordinates', or None when no infobox has coordinates
        """
        xpath = ".//span[@id='coordinates']"

        for ibox in get_infoboxes(article):
            src = ibox.html_source()
            if src is None:
                # Nothing to search in this infobox; try the next one.
                continue

            lat = src.find(xpath + "//span[@class='latitude']")
            lon = src.find(xpath + "//span[@class='longitude']")

            if lat is None or lon is None:
                # Incomplete coordinates; another infobox may have them.
                continue

            # NOTE(review): _dton presumably converts the span text to a
            # number -- confirm against its definition.
            nlat = self._dton(totext(lat))
            nlon = self._dton(totext(lon))

            return lispify([nlat, nlon], typecode='coordinates')

        return None
Пример #12
0
    def image(self, article, attribute):
        """
        Return the file name of the article's infobox image, with the
        caption when one is available.

        :param article: the article whose infoboxes are inspected
        :param attribute: not used by this method
        :returns: the lispified list ``[0, file_name]``, extended with
            the unlinked caption when some infobox has one; None when no
            infobox provides an image
        """
        # Materialised because the infoboxes are traversed twice, which
        # would silently lose the captions with a one-shot iterator.
        infoboxes = list(get_infoboxes(article))

        # Infoboxes lacking an image are skipped: a None in first position
        # would break the string operations below.
        imgs = [img for img in (ibx.get('image') for ibx in infoboxes) if img]
        if not imgs:
            return None

        img = imgs[0]
        fnam = img.replace(" ", "_")
        if "File:" in img:
            fnam = fnam.split("File:")[1]

        # TODO : is this a temporary fix? investigate what this annotation means
        # see 'Bill Clinton' for an example
        if "{{!}}border" in img:
            fnam = fnam.split("{{!}}border")[0]

        # A real list (not filter()) so that truth testing and indexing
        # behave the same on Python 2 and Python 3.
        caps = [cap for cap in (ibx.get('caption') for ibx in infoboxes)
                if cap]
        return lispify([0, fnam] + ([markup_unlink(caps[0])] if caps else []))
Пример #13
0
 def test_date_with_range(self):
     # 2010 is in the given range, thus it will precede 8,8,1991
     text = "2010 8.9.1991 - 2012 on August the 8th 1991"
     expected = '(:yyyymmdd 20100000)'
     self.assertEqual(lispify(text, typecode="yyyymmdd"), expected)
Пример #14
0
 def test_date_multiple_voting(self):
     # Several date candidates in one string; the bare year is expected.
     text = "2010 8.8.1991 on August the 8th 1991"
     expected = '(:yyyymmdd 20100000)'
     self.assertEqual(lispify(text, typecode="yyyymmdd"), expected)
Пример #15
0
 def test_date_simple(self):
     # A date without a year: the year digits are expected to be zeros.
     text = "coming on August the 8th"
     expected = '(:yyyymmdd 00000808)'
     self.assertEqual(lispify(text, typecode="yyyymmdd"), expected)
Пример #16
0
 def test_list_of_dict_with_typecode(self):
     # The typecode is expected to head the list of keyword/value groups.
     dicts = [{'foo': 'bar'}, {'foo': 'baz'}]
     expected = '(:html (:foo "bar") (:foo "baz"))'
     self.assertEqual(lispify(dicts, typecode='html'), expected)
Пример #17
0
 def test_list_of_dict(self):
     # Each dict is expected to become its own keyword/value group.
     dicts = [{'foo': 'bar'}, {'foo': 'baz'}]
     expected = '((:foo "bar") (:foo "baz"))'
     self.assertEqual(lispify(dicts), expected)
Пример #18
0
 def test_double_nested_list(self):
     # Lists nested two levels deep keep their nesting in the output.
     nested = [[0, ['v0', 'foo']], [1, ['v1', 'bar']]]
     expected = '((0 ("v0" "foo")) (1 ("v1" "bar")))'
     self.assertEqual(lispify(nested), expected)
Пример #19
0
 def test_dict_with_escaped_string(self):
     # Double quotes inside a string value are expected to be escaped.
     mapping = {'a': 1, 'b': '"foo"'}
     expected = '(:a 1 :b "\\"foo\\"")'
     self.assertEqual(lispify(mapping), expected)
Пример #20
0
 def test_keyword_with_typecode(self):
     # A keyword-looking string stays unquoted next to the typecode.
     actual = lispify(':feminine', typecode='calculated')
     self.assertEqual(actual, '(:calculated :feminine)')
Пример #21
0
 def test_keyword(self):
     # A keyword-looking string is expected to come back unchanged.
     keyword = ':feminine'
     self.assertEqual(lispify(keyword), keyword)
Пример #22
0
 def test_bool(self):
     # True maps to 't' and False to 'nil'; checked in that order.
     for value, expected in ((True, 't'), (False, 'nil')):
         self.assertEqual(lispify(value), expected)
Пример #23
0
 def test_bool_with_typecode(self):
     # False is expected to appear as nil next to the typecode.
     actual = lispify(False, typecode='calculated')
     self.assertEqual(actual, '(:calculated nil)')
Пример #24
0
 def test_string_with_typecode(self):
     # A plain string is expected to be quoted after the typecode.
     actual = lispify("bar", typecode="html")
     self.assertEqual(actual, '(:html "bar")')
Пример #25
0
 def test_string_not_keyword(self):
     # A leading colon followed by text with spaces is an ordinary string.
     text = ':not a keyword'
     expected = '":not a keyword"'
     self.assertEqual(lispify(text), expected)
Пример #26
0
 def test_list(self):
     # A flat list of strings becomes a parenthesised list of quoted strings.
     classes = ['wikipedia-class1', 'wikipedia-class2']
     expected = '("wikipedia-class1" "wikipedia-class2")'
     self.assertEqual(lispify(classes), expected)
Пример #27
0
 def test_dict(self):
     # Keys are expected as keywords, each followed by its value.
     mapping = {'a': 1, 'b': "foo"}
     expected = '(:a 1 :b "foo")'
     self.assertEqual(lispify(mapping), expected)
Пример #28
0
 def test_list_with_typecode(self):
     # The typecode is expected to head the list, followed by its items.
     pair = [44, 35]
     expected = '(:coordinates 44 35)'
     self.assertEqual(lispify(pair, typecode='coordinates'), expected)
Пример #29
0
 def test_dict_with_list(self):
     # A list used as a dict value is expected as a nested list.
     mapping = {'a': 1, 'b': ['foo', 'bar']}
     expected = '(:a 1 :b ("foo" "bar"))'
     self.assertEqual(lispify(mapping), expected)
Пример #30
0
 def test_nested_list(self):
     # Quotes inside a nested string are expected to be escaped.
     nested = [[0, 'foo'], [1, '"bar"']]
     expected = '((0 "foo") (1 "\\"bar\\""))'
     self.assertEqual(lispify(nested), expected)