def lyrics():
    """Return the lyrics of the song currently playing on 'bide et musique'.

    Fetches the home page to get the current artist/title, follows the
    song's own link and scrapes its lyrics table.  Returns plain text;
    when the song page has no lyrics, returns a French
    "no lyrics available" message instead.
    """
    page = urllib.urlopen(HOME_PAGE)
    content = page.read()
    page.close()
    soup = BeautifulSoup(content)
    # Current title/artist live in dedicated <p> elements on the home page.
    souptitle = soup.findAll("p", {"class": "titre-song"})[0]
    title = souptitle.text
    artist = soup.findAll("p", {"class": "titre-song2"})[0].text
    # The title element links to the song's page, which holds the lyrics.
    url = "%s%s" % ("http://www.bide-et-musique.com", souptitle.a.get("href"))
    page = urllib.urlopen(url)
    content = page.read()
    page.close()
    soup = BeautifulSoup(content)
    tab = soup.findAll("td", {"class": "paroles"})
    if not tab:
        res = "Pas de paroles disponibles pour %s de %s" % (artist, title)
    else:
        res = "%s - %s\n%s\n" % (artist, title, "*" * 30)
        # Keep every child node of the lyrics cell except bare <br /> separators.
        for elt in tab[0].contents:
            if str(elt).lstrip() != "<br />":
                res += xhtml2text(unicode(elt).lstrip()) + "\n"
    return xhtml2text(res)
def extract(divclasse,LOCAL = False):
    """Build a {channel: evening-programme} mapping from a TV-grid page.

    divclasse is a (css_class, location) pair.  The location is opened as
    a local file when LOCAL is true, otherwise fetched as a URL.
    """
    if LOCAL:
        handle = open(divclasse[1])
    else:
        handle = urllib.urlopen(divclasse[1])
    markup = handle.read()
    handle.close()
    soup = BeautifulSoup(markup)
    result = {}
    for block in soup.findAll("div", {"class": divclasse[0]}):
        for item in block.findAll("li"):
            label = item.first("span", {"class":"txtLogoChaine"})
            try:
                label = label.getText().partition("Programme ")[2]
            except AttributeError:
                # Entry without a channel logo: nothing to key on.
                continue
            shows = ""
            for para in item.findAll("p"):
                anchors = para.findAll("a")
                try:
                    show_title = anchors[0].get("title")
                except IndexError:
                    # Paragraph without a programme link.
                    continue
                spans = para.findAll("span")
                shows += "%s : %s "%(spans[0].getText(), show_title)
            result[xhtml2text(label).lower()] = xhtml2text(shows)
    return result
def extract(divclasse, local=False):
    """Parse a TV-grid page into a dict mapping channel name to the
    evening programme string shown for it.

    divclasse: (css_class, location) tuple; location is a file path when
    local is true, a URL otherwise.
    """
    source = open(divclasse[1]) if local else urllib.urlopen(divclasse[1])
    markup = source.read()
    source.close()
    soup = BeautifulSoup(markup)
    programmes = {}
    for grid_div in soup.findAll("div", {"class": divclasse[0]}):
        for entry in grid_div.findAll("li"):
            logo = entry.first("span", {"class": "txtLogoChaine"})
            try:
                channel = logo.getText().partition("Programme ")[2]
            except AttributeError:
                # No channel logo in this entry: skip it.
                continue
            schedule = ""
            for paragraph in entry.findAll("p"):
                links = paragraph.findAll("a")
                try:
                    show = links[0].get("title")
                except IndexError:
                    # Paragraph carries no programme link.
                    continue
                when = paragraph.findAll("span")[0].getText()
                schedule += "%s : %s " % (when, show)
            programmes[xhtml2text(channel).lower()] = xhtml2text(schedule)
    return programmes
def current():
    """Return the track currently playing on bide et musique.

    Reads only the first 1500 bytes of the playlist page — enough to
    reach the second <title> element, whose text after ": " names the
    current track.  Returns an error string on parse failure.
    """
    page = urllib.urlopen(PLAYLIST)
    content = page.read(1500)
    page.close()
    soup = BeautifulSoup(content)
    try:
        return xhtml2text(xhtml2text(soup.findAll("title")[1].text.partition(": ")[2]))
    # `except Exception` (not a bare except) keeps the best-effort
    # behavior without swallowing SystemExit/KeyboardInterrupt.
    except Exception:
        return "HTML parsing failed !"
def extract_data(self, html_content):
    """ Extracts a bashorg quote given the HTML code of the page

    Returns "bashorg <number> :\\n<quote text>", or a not-found message
    when the page contains no quote.
    """
    soup = BeautifulSoup(html_content)
    if not soup.findAll("p", {"class": "qt"}):
        return "The quote does not exist !"
    # The quote number is in a <b> of the same <table> as the quote text,
    # so locate the table that actually contains a quote paragraph.
    for table in soup.findAll("table"):
        quotes = table.findAll("p", {"class": "qt"})
        if quotes:
            content = xhtml2text(unicode(quotes[0]))
            nb = xhtml2text(unicode(table.findAll("b")[0].text))
            return "bashorg %s :\n%s" % (nb, content)
    # Original code hit a NameError here (quote outside any table);
    # report it as missing instead.
    return "The quote does not exist !"
def answer(self, sender, message):
    """Look *message* up on the French wiktionary search API and return
    the first result's snippet, or NOT_FOUND.

    sender is unused here but part of the bot's answer() interface.
    """
    url = "http://fr.wiktionary.org/w/api.php?action=query&list=search&format=json&srsearch=%s&srlimit=10"
    # Quote the query: spaces/accented characters would otherwise
    # produce an invalid URL.
    page = urllib.urlopen(url % urllib.quote_plus(message))
    content = page.read()
    page.close()
    js = json.loads(content)
    try:
        snippet = xhtml2text(js["query"]["search"][0]["snippet"])
        # Removing pronunciation (replaces a space-like character —
        # presumably a no-break space; TODO confirm)
        clean = snippet.replace(" ", " ")
        return clean if clean != "" else NOT_FOUND
    except (KeyError, IndexError):
        return NOT_FOUND
def html_request(msg):
    """Query DuckDuckGo's HTML endpoint for *msg* and return up to
    MAX_RESULT results as "title - url" lines (newline separated).
    """
    # Quote the query so spaces and special characters survive the URL.
    site = urllib.urlopen('http://duckduckgo.com/html/?q=%s' % urllib.quote_plus(msg))
    data = site.read()
    site.close()
    soup = BeautifulSoup.BeautifulSoup(data)
    links = soup.findAll('div', {'class': "links_main links_deep"})
    results = ""
    for link in links[:MAX_RESULT]:
        anchor = link.find("a")
        url = anchor.get("href")
        title = ""
        # Result titles mix plain text with inline tags (e.g. <b>);
        # flatten both.  (Loop variable renamed: the original reused
        # `data` and shadowed the page payload.)
        for part in anchor.contents:
            if isinstance(part, BeautifulSoup.Tag):
                title += " %s" % part.getString()
            else:
                title += " %s" % str(xhtml2text(part))
        results += "%s - %s\n" % (title.strip(), url)
    return results.strip()
def parse_one_track(soup_track):
    """Render a b&m track row (two "baseitem" cells) as "artist - title"
    plain text."""
    artist_cell, title_cell = soup_track.findAll("td", {"class": "baseitem"})
    return xhtml2text("%s - %s" % (artist_cell.text, title_cell.text))
def parse_one_show(soup_show):
    """Format one bide et musique programme row as "hour - name"."""
    cells = soup_show.findAll("td")
    when = xhtml2text(cells[0].text)
    what = xhtml2text(cells[1].text)
    return "%s - %s" % (when, what)
def extract_data(self, html_content):
    """Extract the quote text from the first div.citation of the page,
    one line per paragraph."""
    soup = BeautifulSoup(html_content)
    citation = soup.findAll('div', {"class": 'citation'})[0]
    lines = [paragraph.contents[0] for paragraph in citation.contents]
    return xhtml2text('\n'.join(lines))