예제 #1
0
    def _best_attributes(self):
        """
        Collect infobox attributes from the template's documentation pages.

        The template name in ``self.symbol`` is first resolved to the title
        reported by its article (the original author's example: the
        "president" infobox resolves to "officeholder"). If that lookup
        raises LookupError, a warning is logged and an empty list is
        returned.

        Attributes are then gathered, in this order, from:

        1. the wikimarkup source of the "<title>/doc" subpage, via
           ``_attributes_from_template_data`` (the <templatedata> JSON
           block, see
           https://www.mediawiki.org/wiki/Extension:TemplateData),
        2. the rendered HTML of that same subpage, via
           ``_attributes_from_html``,
        3. the rendered HTML of the template page itself, via
           ``_attributes_from_html`` (some templates have no "/doc"
           subpage).

        A LookupError or ValueError raised while processing the subpage is
        logged and the remainder of the subpage processing is skipped;
        step 3 always runs. The returned list is not de-duplicated.
        """
        try:
            template = get_article(self.symbol).title()
        except LookupError:
            self.log().warn("Could not find doc any template pages for "
                            "template: \"%s\".",
                            self.symbol)
            return []

        collected = []
        template_page = get_article(template)
        try:
            subpage = get_article(template + '/doc')

            collected.extend(self._attributes_from_template_data(
                subpage.markup_source(expiry=Expiry.LONG)))
            collected.extend(self._attributes_from_html(
                subpage.html_source(expiry=Expiry.LONG)))
        except ValueError:
            self.log().error("Error parsing <templatedata> json for %s. "
                             "Check the page for trailing commas. ",
                             template, exc_info=True)
        except LookupError:
            self.log().warn("Could not find doc subpage for template: \"%s\".",
                            template)

        # The template page itself is always scanned, whether or not the
        # subpage could be processed.
        collected.extend(self._attributes_from_html(
            template_page.html_source(expiry=Expiry.LONG)))

        return collected
예제 #2
0
    def test_types(self):
        """
        The "Vladimir Putin" article reports exactly the expected infobox
        types, in any order.
        """
        # TODO : fix case inconsistency in infobox_tree
        expected = [
            "officeholder",
            "martial artist",
            "Person",
            "Sportsperson",
            "Other sportsperson",
        ]

        putin = get_article("Vladimir Putin", self.fetcher)
        self.assertItemsEqual(putin.types(), expected)
예제 #3
0
def sort_named(named, *args):
    """
    Order article titles by how closely they match ``named``.

    Each title in ``args`` is looked up with ``get_article``; titles whose
    lookup raises ``LookupError`` are dropped, and duplicate titles are
    collapsed. The remaining titles are sorted so that:

    1. a title equal to ``named`` comes first,
    2. then titles equal to ``named`` when compared case-insensitively,
    3. then all other titles.

    Ties are broken by article length, shortest first, where the length is
    that of the article's paragraphs joined by single spaces.

    Returns a new list of titles.
    """
    article_lengths = {}
    for title in args:
        try:
            text = ' '.join(get_article(title).paragraphs())
        except LookupError:
            # Unknown articles are silently left out of the result.
            continue
        article_lengths[title] = len(text)

    def rank(title):
        # False sorts before True, so matching titles float to the front;
        # the article length is the final tie-breaker.
        return (title != named,
                title.lower() != named.lower(),
                article_lengths[title])

    # A key function replaces the Python-2-only ``cmp=`` argument and
    # yields the same ordering as the former three-way comparator.
    return sorted(article_lengths, key=rank)
예제 #4
0
def find_date(symbol, date_type):
    """
    Resolve a birth or death date for ``symbol``.

    Every infobox class reported by ``InfoboxClassifier`` is asked, in
    order, for ``date_type``; the first non-None answer is returned as is.

    Otherwise the first paragraph of the article is scanned, one span from
    ``iter_paren(text, ".")`` at a time:

    * if the span contains a date range, its first element is returned for
      'birth-date' and its second element for 'death-date';
    * failing that, for 'birth-date' only, the first single date found in
      the span is returned.

    Paragraph results are lispified with typecode 'yyyymmdd'. Returns None
    when nothing is found.
    """
    for infobox_class in InfoboxClassifier().classify(symbol):
        resolved = InfoboxResolver().resolve_infobox(infobox_class, symbol,
                                                     date_type)
        if resolved is None:
            continue
        return resolved

    # TODO: look at categories for dates

    wants_birth = date_type == 'birth-date'
    wants_death = date_type == 'death-date'

    first_paragraph = get_article(symbol).paragraphs()[0]
    for start, end in iter_paren(first_paragraph, "."):
        candidate = first_paragraph[start:end]

        for date_range in overlay_parse.dates.just_ranges(candidate):
            if wants_birth:
                return lispify(date_range[0], typecode='yyyymmdd')
            if wants_death:
                return lispify(date_range[1], typecode='yyyymmdd')

        # A lone date is taken to be the birth date.
        if wants_birth:
            for single_date in overlay_parse.dates.just_dates(candidate):
                return lispify(single_date, typecode='yyyymmdd')

    return None
예제 #5
0
 def url(self, article, _):
     """
     Return the article's url, lispified with typecode 'url'.

     Per the original author's note this is the wikipedia.org url, NOT
     the place the page was actually fetched from.
     """
     # Going through get_article also takes care of redirections.
     return lispify(get_article(article).url(), typecode='url')
예제 #6
0
    def short_article(self, symbol, _):
        """
        Return the first paragraph of the article, with its HTML kept,
        lispified with typecode 'html'.

        Intended contract (not implemented yet, see the TODO below): when
        the first paragraph is shorter than 350 characters, return as many
        leading paragraphs as needed for the rendered text to reach at
        least 350 characters.
        """

        # TODO: check if the first paragraph is shorter than 350 characters
        article = get_article(symbol)
        return lispify(article.first_paragraph(keep_html=True),
                       typecode='html')
예제 #7
0
    def proper(self, article, _):
        """
        Quick boolean heuristic computed from the symbol text and the
        article text, lispified with typecode 'calculated'.

        The symbol has underscores turned into spaces and any
        parenthesised part removed. The result is true when the lowercase
        form of that name, not counting occurrences directly preceded by
        ". ", appears fewer times in the article text than the name as
        written.
        """

        # Blindly copied by the ruby version
        name = re.sub(r"\s*\(.*\)\s*", "", article.replace("_", " "))
        lowered = name.lower()
        text = totext(get_article(article).html_source())

        lowercase_hits = text.count(lowered)
        after_period_hits = text.count(". " + lowered)
        exact_hits = text.count(name)
        is_proper = lowercase_hits - after_period_hits < exact_hits

        return lispify(is_proper, typecode='calculated')
예제 #8
0
    def number(self, article, _):
        """
        True if the symbol is plural, lispified with typecode
        'calculated'.

        Counts singular verb forms (' is ', ' was ', ' has ') against
        plural ones (' are ', ' were ', ' have ') in the first paragraph;
        the answer is true only when the plural forms are strictly more
        frequent.
        """
        # The first paragraph refers more often to the symbol itself
        # rather than to things related to it.
        text = get_article(article).first_paragraph()

        singular_hits = sum(text.count(verb)
                            for verb in (' is ', ' was ', ' has '))
        plural_hits = sum(text.count(verb)
                          for verb in (' are ', ' were ', ' have '))

        # Strict inequality because singular forms are far more common.
        return lispify(plural_hits > singular_hits, typecode='calculated')
예제 #9
0
    def types(self):
        """
        Return the list of types for this infobox.

        The list holds, in order:

        1. the type derived from the infobox template (omitted when
           ``_to_type`` yields None),
        2. the type derived from the title of the template's article, when
           it is not None and differs from the first one,
        3. the entries that the ``ibx_type_superclasses()`` mapping lists
           for the template type, if any.

        The superclass mapping is loaded once and cached on ``self._sc``.
        """
        if not hasattr(self, "_sc"):
            self._sc = ibx_type_superclasses()

        template = self.template()
        t = self._to_type(template)

        # Build a real list: ``filter`` returns a lazy iterator on
        # Python 3, which supports neither append nor extend.
        types = [] if t is None else [t]

        title = get_article(template, self.fetcher).title()
        title_type = self._to_type(title)
        # None is kept out of the result here too, matching the handling
        # of the template type above.
        if title_type is not None and title_type != t:
            types.append(title_type)

        if t in self._sc:
            types.extend(self._sc[t])

        return types
예제 #10
0
    def _guess_gender(self, symbol):
        """
        Guess the grammatical gender of ``symbol`` from pronoun counts.

        Whole-word occurrences of masculine, feminine and neuter/plural
        pronouns are counted over all paragraphs of the article. Returns
        'neuter' when the neuter count is strictly the largest, otherwise
        'masculine' when masculine pronouns are at least as frequent as
        feminine ones, otherwise 'feminine'.
        """
        article = get_article(symbol, fetcher=self.fetcher)
        text = "\n\n".join(article.paragraphs()).lower()

        def occurrences(words):
            # Total number of whole-word matches for every given word.
            return sum(len(re.findall(r"\b%s\b" % word, text, re.I))
                       for word in words)

        male = occurrences(["he", "him", "his"])
        female = occurrences(["she", "her", "hers"])
        neuter = occurrences(["it", "its", "they", "their", "theirs"])

        if neuter > male and neuter > female:
            return 'neuter'
        if male >= female:
            return 'masculine'
        return 'feminine'
예제 #11
0
def sort_by_length(*args):
    """
    Return the given article titles as a list, longest article first.

    An article's length is the length of its paragraphs joined by single
    spaces.
    """
    def text_length(title):
        return len(' '.join(get_article(title).paragraphs()))

    return sorted(args, key=text_length, reverse=True)
예제 #12
0
 def get_types(self, symbol):
     """Return the lispified types of the article for ``symbol``."""
     return lispify(get_article(symbol).types())
예제 #13
0
 def get_classes(self, symbol):
     """Return the lispified classes of the article for ``symbol``."""
     classes = get_article(symbol).classes()
     return lispify(classes)
예제 #14
0
 def get_categories(self, symbol):
     """Return the lispified categories of the article for ``symbol``."""
     return lispify(get_article(symbol).categories())
예제 #15
0
 def test_article(self):
     """
     ``util.get_article`` yields an object whose type is exactly
     ``Article`` for both a symbol and an existing ``Article``.
     """
     art = Article(self.symbol)
     for source in (self.symbol, art):
         self.assertIs(Article, type(util.get_article(source)))