Python beautify示例，surllib.beautify Python示例

示例#1

0

显示文件

def wpTrimPlan(bs):
    """Reduce a parsed page to its heading plus the main table.

    A fresh, empty document is created with ``surllib.beautify('')``.  The
    result of ``bs.find('h2')`` is appended to it, followed by the first
    table in ``bs`` that contains more than two ``tr`` elements.

    Raises IndexError when no table has more than two ``tr`` elements.
    """
    trimmed = surllib.beautify('')
    heading = bs.find('h2')
    trimmed.append(heading)

    # Gather every qualifying table before picking the first one, so that
    # a page without any still fails with IndexError on the lookup below.
    candidates = []
    for table in bs.findAll('table'):
        if len(table.findAll('tr')) > 2:
            candidates.append(table)
    trimmed.append(candidates[0])

    return trimmed

示例#2

0

显示文件

文件： pgWeekplans.py 项目： agraae/fskintra

def wpTrimPlan(bs):
    """Return a new document holding only the title and the main table.

    The new document starts out empty (``surllib.beautify('')``).  It
    receives whatever ``bs.find('h2')`` returns, and then the first table
    of ``bs`` that has more than two ``tr`` elements.

    Raises IndexError if ``bs`` has no table with more than two rows.
    """
    result = surllib.beautify('')
    page_title = bs.find('h2')
    result.append(page_title)

    # All tables are inspected first; the main table is the first match.
    large_tables = []
    for tbl in bs.findAll('table'):
        row_count = len(tbl.findAll('tr'))
        if row_count > 2:
            large_tables.append(tbl)
    main_table = large_tables[0]
    result.append(main_table)

    return result

示例#3

0

显示文件

文件： pgDialogue.py 项目： agraae/fskintra

def diaFindMessages(data):
    """Find all message links in the HTML data and examine each of them.

    ``data`` is parsed with ``surllib.beautify``.  Every ``<a>`` element
    whose ``href`` starts with 'VisBesked' and whose text is non-empty is
    passed to ``diaExamineMessage`` together with the numeric id taken
    from the ``Id=`` part of the link.

    Returns True if ``diaExamineMessage`` returned a true value for at
    least one message (described by the original author as "at least one
    email was sent"), otherwise False.

    Raises AttributeError if a matching link carries no ``Id=<digits>``
    part.
    """
    bs = surllib.beautify(data)

    newMsgFound = False
    for atag in bs.findAll('a'):
        # Anchors without an href (e.g. named anchors) cannot be message
        # links.  Indexing atag['href'] would raise KeyError on them and
        # abort the whole scan, so read the attribute with .get() instead.
        href = atag.get('href')
        if not href or not href.startswith('VisBesked'):
            continue  # ignore

        title = atag.text.strip()
        if not title:
            continue  # ignore (this is the envelope icon)

        lurl = 'https://%s%s%s' % (config.HOSTNAME, URL_PREFIX, href)
        mid = re.search('Id=(\\d+)', href).group(1)

        # Every message is examined; no early exit after the first hit.
        if diaExamineMessage(lurl, mid):
            newMsgFound = True
    return newMsgFound

示例#4

0

显示文件

文件： pgDialogue.py 项目： webjay/fskintra

def diaFindMessages(data):
    """Scan dialogue HTML for message links and examine each one.

    The HTML in ``data`` is parsed with ``surllib.beautify``.  For each
    ``<a>`` element with an ``href`` beginning with 'VisBesked' and with
    non-empty link text, the absolute URL and the digits following
    ``Id=`` are handed to ``diaExamineMessage``.

    Returns True when ``diaExamineMessage`` gave a true result for at
    least one message (per the original docstring: "at least one email
    was sent"); False otherwise.

    Raises AttributeError when a matching link has no ``Id=<digits>``
    part.
    """
    bs = surllib.beautify(data)

    newMsgFound = False
    for atag in bs.findAll('a'):
        # An <a> element is not required to have an href; atag['href']
        # raises KeyError for such anchors, which would stop processing
        # of all remaining messages.  Skip them instead.
        href = atag.get('href')
        if not href or not href.startswith('VisBesked'):
            continue  # ignore

        title = atag.text.strip()
        if not title:
            continue  # ignore (this is the envelope icon)

        lurl = 'https://%s%s%s' % (config.HOSTNAME, URL_PREFIX, href)
        mid = re.search('Id=(\\d+)', href).group(1)

        # Keep looping after a hit so that every message is examined.
        if diaExamineMessage(lurl, mid):
            newMsgFound = True
    return newMsgFound

示例#5

0

显示文件

文件： pgDocuments.py 项目： svalgaard/fskintra

def docFindDocuments(bs, foldername="Dokumentarkiv"):
    """Process one page of the document archive, recursing into subfolders.

    Every ``tr`` in ``bs`` whose ``class`` attribute has a token starting
    with "linje" is treated as one entry.  Entries whose URL contains
    "Dokliste" are subfolders: their page is fetched through
    ``surllib.skoleGetURL`` and processed recursively.  Every other entry
    is a document, for which a ``semail.Message`` is built and handed to
    ``maybeSend()``.

    Args:
        bs: Parsed page (BeautifulSoup-style object providing ``findAll``).
        foldername: Display path of the folder shown on this page; it is
            prefixed to each entry title and extended on recursion.

    Raises:
        AssertionError: If a row does not have the expected layout (fewer
            than two links, an unrecognised link target, or not exactly
            one non-empty date cell).
    """
    trs = bs.findAll("tr")

    for line in trs:
        # Only rows carrying a class token that starts with "linje" are
        # entries; everything else is skipped.
        if not line.has_key("class"):
            continue
        if not [c for c in line["class"].split() if c.startswith("linje")]:
            continue

        links = line.findAll("a")
        assert len(links) >= 2

        # find file type: taken from the icon file name of the first link,
        # dropping its first two and last four characters
        ext = links[0].img["src"].split("/")[-1][2:-4].lower()

        # find name of file
        title = links[1].text
        ltitle = foldername + " / " + title

        # find url
        url = links[0]["href"]
        config.log(u"Kigger på dokument url: %s" % url, 3)
        m = re.match(r"javascript:visdokument\((\d+),'([^']+)'\).*", url)
        if m:
            # javascript:visdokument(<id>,'<url>') - use the quoted url
            url = m.group(2)
        elif "visdokument" in url.lower():
            # Raw string, like the pattern above, so that \d is not read
            # as a (invalid) string escape.
            url = URL_DOC + re.search(r".*?(\d+)", links[0]["href"]).group(1)
        elif links[0].has_key("onclick") and "visdok" in links[0]["onclick"]:
            pass  # href is actually the file url
        else:
            assert "Dokliste" in url
        url = urllib.quote(url.encode("iso-8859-1"), safe=":/?=&%")

        # find date
        dts = line.findAll("td", width="18%")
        assert len(dts) == 1 and dts[0].text  # exactly one date
        date = dts[0].text

        # now do stuff
        if "Dokliste" in url:
            # this is a subfolder

            # first look at (potentially cached version)
            suburl = URL_PREFIX + url
            subbs = surllib.skoleGetURL(suburl, True)

            # The date text is day-month-year separated by "-"; reverse it
            # into (year, month, day).  A list is built explicitly because
            # reversed() needs a sequence, not a lazy map object.
            subdate = datetime.date(
                *reversed([int(part) for part in date.split("-")]))
            if subbs.cachedate <= subdate or subbs.cacheage >= 1.9:
                # cached version is too old - refetch
                # NOTE(review): the third argument presumably forces the
                # refetch - confirm against surllib.skoleGetURL.
                subbs = surllib.skoleGetURL(suburl, True, True)
                config.log(u"Kigger på folderen %s" % title)
            else:
                config.log(u"Kigger på folderen %s (fra cache)" % title)

            docFindDocuments(subbs, ltitle)
        else:
            # this is an actual document
            config.log(u"Kigger på dokumentet %s" % ltitle)

            # Create HTML snippet; the link attributes are set on the
            # parsed tree rather than formatted into the markup.
            html = u"<p>Nyt dokument: <a href=''>%s</a></p>" % ltitle
            h = surllib.beautify(html)
            h.a["href"] = url
            h.a["usefilename"] = title + "." + ext

            msg = semail.Message("documents", h)
            msg.setTitle(u"%s" % title)
            msg.setDate(date)
            msg.maybeSend()

示例#6

0

显示文件

def docFindDocuments(bs, foldername='Dokumentarkiv'):
    '''Process one page of documents, including all of its subfolders.

    Each ``tr`` in ``bs`` whose ``class`` attribute has a token starting
    with 'linje' is one entry.  An entry whose URL contains 'Dokliste' is
    a subfolder: its page is fetched via ``surllib.skoleGetURL`` and
    handled recursively.  Any other entry is a document, for which a
    ``semail.Message`` is created and passed to ``maybeSend()``.

    bs is the parsed page (BeautifulSoup-style object with ``findAll``);
    foldername is the display path of this folder, prefixed to each entry
    title and extended when recursing.

    Raises AssertionError if a row does not have the expected layout
    (fewer than two links, a link that is neither a 'visDokument' nor a
    'Dokliste' link, or not exactly one non-empty date cell).
    '''
    trs = bs.findAll('tr')

    for line in trs:
        # Only rows with a class token starting with 'linje' are entries.
        if not line.has_key('class'):
            continue
        if not any(c.startswith('linje') for c in line['class'].split()):
            continue

        links = line.findAll('a')
        assert len(links) >= 2

        # find file type: taken from the icon file name of the first link,
        # dropping its first two and last four characters
        ext = links[0].img['src'].split('/')[-1][2:-4].lower()

        # find name of file
        title = links[1].text
        ltitle = foldername + ' / ' + title

        # find url
        url = links[0]['href']
        if 'visDokument' in url:
            # Raw string so that \d is not read as a (invalid) string
            # escape.
            url = URL_DOC + re.search(r'.*?(\d+)', links[0]['href']).group(1)
        else:
            assert 'Dokliste' in url
        url = urllib.quote(url.encode('iso-8859-1'), safe=':/?=&%')

        # find date
        dts = line.findAll('td', width='18%')
        assert len(dts) == 1 and dts[0].text  # exactly one date
        date = dts[0].text

        # now do stuff
        if 'Dokliste' in url:
            # this is a subfolder

            # first look at (potentially cached version)
            suburl = URL_PREFIX + url
            subbs = surllib.skoleGetURL(suburl, True)

            # The date text is day-month-year separated by '-'; reverse it
            # into (year, month, day).  A list is built explicitly because
            # reversed() needs a sequence, not a lazy map object.
            subdate = datetime.date(
                *reversed([int(part) for part in date.split('-')]))
            if subbs.cachedate <= subdate or \
               (datetime.date.today() - subbs.cachedate).days > 2:
                # cached version is too old - refetch
                # NOTE(review): the third argument presumably forces the
                # refetch - confirm against surllib.skoleGetURL.
                subbs = surllib.skoleGetURL(suburl, True, True)
                config.log(u'Kigger på folderen %s' % title)
            else:
                config.log(u'Kigger på folderen %s (fra cache)' % title)

            docFindDocuments(subbs, ltitle)
        else:
            # this is an actual document
            config.log(u'Kigger på dokumentet %s' % ltitle)

            # Create HTML snippet; the link attributes are set on the
            # parsed tree rather than formatted into the markup.
            html = u"<p>Nyt dokument: <a href=''>%s</a></p>" % ltitle
            h = surllib.beautify(html)
            h.a['href'] = url
            h.a['usefilename'] = title + '.' + ext

            msg = semail.Message('documents', h)
            msg.setTitle(u'%s' % title)
            msg.setDate(date)
            msg.maybeSend()