# Example 1
def test_connection_error(bot_helper, responses):
    url = 'http://example.com/foo/bar'
    # Sanity check: the "responses" fixture mocks out the network, so a raw
    # HTTP request must fail with a connection error.
    with pytest.raises(requests.ConnectionError):
        simple_http_get(url)
    # linkinfo should swallow that exception internally and hand back an
    # error result rather than raising.
    link_result = bot_helper['linkinfo'].get_link_info(url)
    assert link_result.is_error
# Example 2
def get_yt_json(vid_id):
    """Fetch the (vaguely) relevant part of YouTube's JSON for a video.

    Returns the "entry" object from the API response, or ``None`` when the
    HTTP request does not succeed.
    """
    # API version 2 is needed to get the like count.
    url = ("https://gdata.youtube.com/feeds/api/videos/"
           "{}?alt=json&v=2".format(vid_id))
    response = simple_http_get(url)
    if response.status_code == requests.codes.ok:
        return response.json()["entry"]
    return None
# Example 3
    def search_hoogle(self, e):
        """Search Hoogle with a given string and return the first few
        (exact number configurable) results.

        Replies on *e* with either the parse error reported by Hoogle or a
        summary line listing the top results and a link to the full search.
        """

        query = e['data']
        hurl = 'http://www.haskell.org/hoogle/?mode=json&hoogle=' + query
        hresp = simple_http_get(hurl)

        if hresp.status_code != requests.codes.ok:
            self.log.warn('request failed for ' + hurl)
            return

        # The Hoogle response JSON is of the following format:
        # {
        #  "version": "<hoogle version>"
        #  "results": [
        #    {
        #      "location": "<link to docs>"
        #      "self":     "<name> :: <type>"
        #      "docs":     "<short description>"
        #    },
        #    ...
        #  ]
        # }

        maxresults = int(self.config_get('results'))

        # Bug fix: the old guard was ``if hresp.json is None`` — that compares
        # the bound *method* object, which is never None, so malformed JSON
        # slipped past and the later ``hresp.json()`` calls raised.  Parse
        # exactly once and catch the decode failure explicitly.
        try:
            data = hresp.json()
        except ValueError:
            self.log.warn('invalid JSON received from Hoogle')
            return

        if 'parseError' in data:
            e.reply(data['parseError'].replace('\n', ' '))
            return

        allresults = data['results']
        totalresults = len(allresults)
        # Only the "self" (name :: type) line of each result is shown.
        niceresults = [result['self'] for result in allresults[:maxresults]]

        encqry = urllib.parse.quote(query.encode('utf-8'))
        fullurl = 'http://www.haskell.org/hoogle/?hoogle=' + encqry

        e.reply('Showing {} of {} results: {} ({})'.format(
            min(maxresults, totalresults),
            totalresults,
            '; '.join(niceresults),
            fullurl))
# Example 4
def get_info(number=None):
    """Fetch JSON metadata for an xkcd comic.

    Retrieves comic *number*, or the latest comic when no number is given.
    Returns a dict holding "title", "alt", "num" and a computed "url", or
    ``None`` when the HTTP request fails.
    """
    if number:
        url = "http://xkcd.com/{}/info.0.json".format(number)
    else:
        url = "http://xkcd.com/info.0.json"

    response = simple_http_get(url)
    if response.status_code != requests.codes.ok:
        return None

    # Keep only the fields we actually use from the full payload.
    raw = response.json()
    subset = {field: raw[field] for field in ("title", "alt", "num")}

    # Unfuck up unicode strings
    subset = fix_json_unicode(subset)

    subset["url"] = "http://xkcd.com/" + str(subset["num"])
    return subset
# Example 5
    def scrape_html_title(self, url):
        """Scrape the ``<title>`` tag contents from an HTML page.

        *url* is a parsed URL object (``geturl()``/``netloc`` are used).
        Returns a ``('Title', nsfw, '"..."')`` tuple, or ``None`` when the
        page cannot be fetched, is not HTML, or has no usable title.
        """
        # Let's see what's on the other end...
        r = simple_http_get(url.geturl())
        # Only bother with 200 OK
        if r.status_code != requests.codes.ok:
            self.log.debug('request failed for ' + url.geturl())
            return None
        # Robustness fix: a response with no Content-Type header used to
        # raise KeyError here; treat a missing header as "not HTML".
        content_type = r.headers.get('Content-Type', '')
        if 'html' not in content_type:
            self.log.debug('Content-Type not HTML-ish ({}): {}'
                           .format(content_type, url.geturl()))
            return None

        # Attempt to scrape the HTML for a <title>
        if 'charset=' in content_type:
            # If present, HTTP Content-Type header charset takes precedence
            # over any encoding declared inside the document.
            parser = lxml.html.HTMLParser(
                encoding=content_type.rsplit('=', 1)[1])
        else:
            parser = lxml.html.html_parser
        html = lxml.etree.fromstring(r.content, parser)
        title = html.find('.//title')

        # Bug fix: an empty <title/> element has .text == None, which used
        # to crash the whitespace normalisation below with AttributeError.
        if title is None or not title.text:
            self.log.debug('failed to find <title>: ' + url.geturl())
            return None

        # Normalise title whitespace
        title = ' '.join(title.text.strip().split())
        nsfw = url.netloc.endswith('.xxx')

        # See if the title is in the URL
        if self._filter_title_in_url(url, title):
            return None

        # Return the scraped title
        return 'Title', nsfw, '"{}"'.format(title)