# getNTPserversNew.py
#
# Demonstration of the parsing module, implementing a HTML page scanner,
# to extract a list of NTP time servers from the NIST web site.
#
# Copyright 2004-2010, by Paul McGuire
# September, 2010 - updated to more current use of setResultsName, new NIST URL
#
from pyparsingOD import (
    Word,
    Combine,
    Suppress,
    SkipTo,
    nums,
    makeHTMLTags,
    delimitedList,
    alphas,
    alphanums,
)
import urllib.request
import urllib.parse
import urllib.error

# An IP address is four dot-separated integers, combined into one token.
integer = Word(nums)
ipAddress = Combine(integer + "." + integer + "." + integer + "." + integer)

# A host name is a dot-delimited list of words, combined into one token.
hostname = delimitedList(Word(alphas, alphanums + "-_"), ".", combine=True)

# One server entry is three consecutive <td> cells: host name, IP address,
# and free-form text (everything up to the closing </td>) stored as "loc".
tdStart, tdEnd = makeHTMLTags("td")
timeServerPattern = (
    tdStart + hostname("hostname") + tdEnd
    + tdStart + ipAddress("ipAddr") + tdEnd
    + tdStart + SkipTo(tdEnd)("loc") + tdEnd
)

# get list of time servers
nistTimeServerURL = "http://tf.nist.gov/tf-cgi/servers.cgi#"

# The context manager closes the response even if read() raises.  read()
# returns bytes, so decode to text (using the charset announced by the
# server, falling back to UTF-8) before handing the page to the parser.
with urllib.request.urlopen(nistTimeServerURL) as serverListPage:
    charset = serverListPage.headers.get_content_charset() or "utf-8"
    serverListHTML = serverListPage.read().decode(charset, errors="replace")
# Scan a web page for <A> tags and print each link's body text together with
# its href attribute, skipping anything inside HTML comments.
from pyparsingOD import makeHTMLTags, SkipTo, htmlComment
import urllib.request
import urllib.parse
import urllib.error

# The context manager closes the response even if read() raises.  read()
# returns bytes, so decode to text (using the charset announced by the
# server, falling back to UTF-8) before handing the page to the parser.
with urllib.request.urlopen("http://www.yahoo.com") as serverListPage:
    charset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(charset, errors="replace")

# A link is an opening tag, everything up to the closing tag (kept under the
# results name "link"), and the closing tag itself.
aStart, aEnd = makeHTMLTags("A")
link = aStart + SkipTo(aEnd).setResultsName("link") + aEnd
link.ignore(htmlComment)

# scanString yields (tokens, start, end) for every match in the text.
for toks, start, end in link.scanString(htmlText):
    print(toks.link, "->", toks.startA.href)
import urllib.request
import urllib.parse
import urllib.error

from pyparsingOD import makeHTMLTags, SkipTo

# read HTML from a web page
# The context manager closes the response even if read() raises.  read()
# returns bytes, so decode to text (using the charset announced by the
# server, falling back to UTF-8) before handing the page to the parser.
with urllib.request.urlopen("http://www.yahoo.com") as serverListPage:
    charset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(charset, errors="replace")

# using makeHTMLTags to define opening and closing tags
anchorStart, anchorEnd = makeHTMLTags("a")

# compose an expression for an anchored reference
anchor = anchorStart + SkipTo(anchorEnd)("body") + anchorEnd

# use scanString to scan through the HTML source, extracting
# just the anchor tags and their associated body text
# (note the href attribute of the opening A tag is available
# as an attribute in the returned parse results)
for tokens, start, end in anchor.scanString(htmlText):
    print(tokens.body, '->', tokens.href)
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsingOD import (
    Literal,
    Suppress,
    CharsNotIn,
    CaselessLiteral,
    Word,
    dblQuotedString,
    alphanums,
    SkipTo,
    makeHTMLTags,
)
import urllib.request
import urllib.parse
import urllib.error
import pprint

# Define the pyparsing grammar for a URL, that is:
#    URLlink ::= <a href= URL>linkText</a>
#    URL ::= doubleQuotedString | alphanumericWordPath
# Note that whitespace may appear just about anywhere in the link.  Note also
# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
# pyparsing skips over whitespace between tokens.
linkOpenTag, linkCloseTag = makeHTMLTags("a")
link = (
    linkOpenTag
    + SkipTo(linkCloseTag).setResultsName("body")
    + linkCloseTag.suppress()
)

# Go get some HTML with some links in it.
# The context manager closes the response even if read() raises.  read()
# returns bytes, so decode to text (using the charset announced by the
# server, falling back to UTF-8) before handing the page to the parser.
with urllib.request.urlopen("http://www.google.com") as serverListPage:
    charset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(charset, errors="replace")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).
for toks, strt, end in link.scanString(htmlText):
    print(toks.startA.href, "->", toks.body)

# Create dictionary from a comprehension, assembled from each pair of tokens returned
# from a matched URL.
# NOTE(review): the original statement was cut off after "pprint.pprint(";
# the body -> href mapping below follows the comment above - confirm the
# intended key/value order.
pprint.pprint(
    {toks.body: toks.startA.href for toks, strt, end in link.scanString(htmlText)}
)
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsingOD import (
    Literal,
    Suppress,
    CharsNotIn,
    CaselessLiteral,
    Word,
    dblQuotedString,
    alphanums,
    SkipTo,
    makeHTMLTags,
)
import urllib.request
import urllib.parse
import urllib.error
import pprint

# Define the pyparsing grammar for a URL, that is:
#    URLlink ::= <a href= URL>linkText</a>
#    URL ::= doubleQuotedString | alphanumericWordPath
# Note that whitespace may appear just about anywhere in the link.  Note also
# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
# pyparsing skips over whitespace between tokens.
linkOpenTag, linkCloseTag = makeHTMLTags("a")
link = (
    linkOpenTag
    + SkipTo(linkCloseTag).setResultsName("body")
    + linkCloseTag.suppress()
)

# Go get some HTML with some links in it.
# The context manager closes the response even if read() raises.  read()
# returns bytes, so decode to text (using the charset announced by the
# server, falling back to UTF-8) before handing the page to the parser.
with urllib.request.urlopen("http://www.google.com") as serverListPage:
    charset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(charset, errors="replace")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).
for toks, strt, end in link.scanString(htmlText):
    print(toks.startA.href, "->", toks.body)

# Create dictionary from a comprehension, assembled from each pair of tokens returned
# from a matched URL (link body text -> href).  When the same body text occurs
# more than once, the last href seen wins, as with the original dict([...]).
pprint.pprint(
    {toks.body: toks.startA.href for toks, strt, end in link.scanString(htmlText)}
)
import urllib.request
import urllib.parse
import urllib.error

from pyparsingOD import makeHTMLTags, SkipTo

# read HTML from a web page
# The context manager closes the response even if read() raises.  read()
# returns bytes, so decode to text (using the charset announced by the
# server, falling back to UTF-8) before handing the page to the parser.
with urllib.request.urlopen("http://www.yahoo.com") as serverListPage:
    charset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(charset, errors="replace")

# using makeHTMLTags to define opening and closing tags
anchorStart, anchorEnd = makeHTMLTags("a")

# compose an expression for an anchored reference
anchor = anchorStart + SkipTo(anchorEnd)("body") + anchorEnd

# use scanString to scan through the HTML source, extracting
# just the anchor tags and their associated body text
# (note the href attribute of the opening A tag is available
# as an attribute in the returned parse results)
for tokens, start, end in anchor.scanString(htmlText):
    print(tokens.body, '->', tokens.href)
# getNTPserversNew.py
#
# Demonstration of the parsing module, implementing a HTML page scanner,
# to extract a list of NTP time servers from the NIST web site.
#
# Copyright 2004-2010, by Paul McGuire
# September, 2010 - updated to more current use of setResultsName, new NIST URL
#
from pyparsingOD import (
    Word,
    Combine,
    Suppress,
    SkipTo,
    nums,
    makeHTMLTags,
    delimitedList,
    alphas,
    alphanums,
)
import urllib.request
import urllib.parse
import urllib.error

# An IP address is four dot-separated integers, combined into one token.
integer = Word(nums)
ipAddress = Combine(integer + "." + integer + "." + integer + "." + integer)

# A host name is a dot-delimited list of words, combined into one token.
hostname = delimitedList(Word(alphas, alphanums + "-_"), ".", combine=True)

# One server entry is three consecutive <td> cells: host name, IP address,
# and free-form text (everything up to the closing </td>) stored as "loc".
tdStart, tdEnd = makeHTMLTags("td")
timeServerPattern = (
    tdStart + hostname("hostname") + tdEnd
    + tdStart + ipAddress("ipAddr") + tdEnd
    + tdStart + SkipTo(tdEnd)("loc") + tdEnd
)

# get list of time servers
nistTimeServerURL = "http://tf.nist.gov/tf-cgi/servers.cgi#"

# The context manager closes the response even if read() raises.  read()
# returns bytes, so decode to text (using the charset announced by the
# server, falling back to UTF-8) before handing the page to the parser.
with urllib.request.urlopen(nistTimeServerURL) as serverListPage:
    charset = serverListPage.headers.get_content_charset() or "utf-8"
    serverListHTML = serverListPage.read().decode(charset, errors="replace")

# Print every matched server and remember its location text, keyed by IP
# address (note: the stored location is not stripped, only the printed one).
addrs = {}
for srvr, startloc, endloc in timeServerPattern.scanString(serverListHTML):
    print(f"{srvr.ipAddr} ({srvr.hostname.strip()}) - {srvr.loc.strip()}")
    addrs[srvr.ipAddr] = srvr.loc