def d(a, b):
    """Return a distance between strings `a` and `b`.

    The metric is selected by the module-level global `function`:
      "lcs"  - length of the longest common substring
      "lenS" - absolute difference of the lengths
      "d&l"  - letter-count difference plus digit-count difference
      "edit" - positional mismatches plus the length difference

    Returns None if `function` matches none of the above.
    """
    if function == "lcs":
        # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python_2
        # xrange is Python 2 only; range is correct here and works everywhere.
        m = [[0] * (1 + len(b)) for _ in range(1 + len(a))]
        longest, x_longest = 0, 0
        for x in range(1, 1 + len(a)):
            for y in range(1, 1 + len(b)):
                if a[x - 1] == b[y - 1]:
                    m[x][y] = m[x - 1][y - 1] + 1
                    if m[x][y] > longest:
                        longest = m[x][y]
                        x_longest = x
                else:
                    m[x][y] = 0
        # len() of the slice equals `longest`; kept in the original's form
        # ("used to be without len()").
        return len(a[x_longest - longest:x_longest])
    if function == "lenS":
        return abs(len(a) - len(b))
    if function == "d&l":
        import re
        # re.findAll does not exist; the method is re.findall.
        lettersa = len(re.findall("[a-zA-Z]", a))
        numbersa = len(re.findall("[0-9]", a))
        lettersb = len(re.findall("[a-zA-Z]", b))
        numbersb = len(re.findall("[0-9]", b))
        return abs(lettersa - lettersb) + abs(numbersa - numbersb)
    if function == "edit":
        short = min(len(a), len(b))
        diff = max(len(a), len(b)) - short
        for i in range(short):
            if a[i] != b[i]:
                diff += 1
        return diff
def ScrapePage2(browseTo):
    """Crawl same-domain links starting at `browseTo` (depth-first).

    Uses module-level globals: `urls` (dict of already-seen URLs, also
    updated here), `domain` (hostname filter), `lookingFor` (regex searched
    for on the seed page only).  Returns None; crawling is the side effect.
    """
    def _fetch(url):
        # Best-effort fetch: page text, or None on any network/decode failure.
        try:
            page = urllib.request.urlopen(url)
            return page.read().decode("utf-8")
        except Exception:
            return None

    def _new_same_domain_links(base, text):
        # href targets resolved against `base`, unseen, restricted to `domain`.
        found = []
        for href in re.findall(r'href=[\'"]?([^\'" >]+)', text):
            goodUrl = urljoin(base, href)
            if goodUrl not in urls and domain == urlparse(goodUrl).hostname:
                found.append(goodUrl)
        return found

    pageContents = _fetch(browseTo)
    if pageContents is None:
        return
    # Original said re.findAll (no such attribute) and was missing the ':'
    # after the if — both fixed here.
    if re.findall(lookingFor, pageContents):
        print("Ding! on " + browseTo)

    trail = []
    for goodUrl in _new_same_domain_links(browseTo, pageContents):
        trail.append(goodUrl)
        urls[goodUrl] = goodUrl
    while trail:
        e = trail.pop()
        print(e)
        pageContents = _fetch(e)
        if pageContents is None:
            continue
        for goodUrl in _new_same_domain_links(e, pageContents):
            trail.append(goodUrl)
            urls[goodUrl] = goodUrl
def getContent(self, page):
    """Extract post bodies from `page` HTML.

    Each '<div id="post_content_...">...</div>' body is run through
    self.tool.replace, framed with newlines, and UTF-8 encoded.
    Returns a list of bytes.
    """
    pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
    # re.findAll does not exist; the correct name is re.findall.
    items = re.findall(pattern, page)
    contents = []
    for item in items:
        content = "\n" + self.tool.replace(item) + "\n"
        contents.append(content.encode('utf-8'))
    return contents
def getContent(self, page):
    """Return the post bodies found in `page` as a list of UTF-8 bytes.

    Matches every '<div id="post_content_...">...</div>', passes the inner
    HTML through self.tool.replace, and wraps each result in newlines.
    """
    pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
    items = re.findall(pattern, page)  # was re.findAll (AttributeError)
    contents = []
    for item in items:
        content = "\n" + self.tool.replace(item) + "\n"
        contents.append(content.encode('utf-8'))
    return contents
def indentDepth(self, whiteSpace):
    """Return the indentation depth of `whiteSpace`: the number of
    occurrences of self.indentToken in it.

    Asserts that tabs and spaces are not mixed relative to the configured
    token.  (Kept as asserts to preserve the exception type callers see.)
    """
    # Make sure there is no mixing of tabs and spaces
    assert (self.indentToken != "")
    if (self.indentToken == "\t"):
        assert (" " not in whiteSpace)
    else:
        assert ("\t" not in whiteSpace)
    # re.findAll does not exist; the correct name is re.findall.
    # indentToken ("\t" or spaces) has no regex metacharacters, so it is
    # safe to use as a pattern directly.
    return len(re.findall(self.indentToken, whiteSpace))
def indentDepth(self, whiteSpace):
    """Count how many times self.indentToken appears in `whiteSpace`.

    Validates (via assert, preserving the original failure mode) that the
    whitespace does not mix tabs and spaces with respect to the token.
    """
    # Make sure there is no mixing of tabs and spaces
    assert(self.indentToken != "")
    if(self.indentToken == "\t"):
        assert(" " not in whiteSpace)
    else:
        assert("\t" not in whiteSpace)
    return len(re.findall(self.indentToken, whiteSpace))  # was re.findAll
def getAllBoardsForUser(self, refName):
    """Fetch the Pinterest profile page for `refName` via self.br, scrape the
    inline 'myboards' JSON, and return a pin id string.

    NOTE(review): this function has an unresolved bug kept visible below —
    `request` is not defined in this scope.
    """
    time.sleep(1)  # crude rate limiting before hitting the profile page
    resp = self.br.open("http://pinterest.com/%s/" % refName)
    con = resp.read()
    jsonBoard = """var myboards =(.*?[^]]?)]"""
    # was re.findAll (no such attribute); the re.compile wrapper is redundant
    # with findall, which accepts a pattern string directly.
    m = re.findall(jsonBoard, con)
    print(m)
    # NOTE(review): `request` is undefined here and raises NameError as
    # written — it likely should reuse `resp` or a fresh urlopen; confirm
    # against the caller before relying on this path.
    content = json.loads(request.read()).get("url")
    # str.replace replaces the Python-2-only string.replace module function.
    pinID = content.replace("/pin/", "").replace("/", "")
    return pinID
def processString(self, string, page):
    """
    Given a string chunk from a file, runs a RegExp to detect
    proper-name-like tokens (capitalized words).

    NOTE(review): the docstring's original claim about appending markers to
    a 'strings' list is not implemented in the visible code; `matches` is
    computed but unused — possibly truncated source.
    """
    # Regex for proper names.
    # NOTE(review): the character class contains a literal '|' — almost
    # certainly unintended; confirm whether '[A-Z][a-zA-Z.]+' was meant.
    regex = '[A-Z][a-zA-Z|.]+'
    patt = re.compile(regex)
    matches = re.findall(patt, string)  # was re.findAll (AttributeError)
def mmInfo(self, path):
    """Scrape the profile-info <ul> from the current selenium page
    (self.gate.page_source) and write label/value pairs to <path>info.txt."""
    content = self.gate.page_source.encode('utf-8')
    print("妹子信息页面:")
    print(content)
    # NOTE(review): `content` is bytes but the pattern is str — under
    # Python 3 re.findall would raise TypeError; confirm whether the
    # .encode() is actually wanted here.
    pattern = re.compile('.*?<ul class="mm-p-info-cell clearfix">' +
                         '.*?<li class=".*?<label>(.*?)</label><span>(.*?)' +
                         '</span></li>.*?</ul>.*?', re.S)
    infoPairs = re.findall(pattern, content)  # was re.findAll
    # findall with two capture groups already yields (label, value) tuples;
    # the old index arithmetic mis-paired entries and wrote raw tuples.
    # `with` guarantees the file is closed even if a write fails.
    with open(path + 'info.txt', 'w') as fo:
        for key, value in infoPairs:
            fo.write(key)
            fo.write(value + '\n')
def global_reg(pattern, content):
    """Return all matches of `pattern` in `content`, compiled with DOTALL
    so '.' also matches newlines.  (Was re.findAll, which does not exist.)"""
    return re.findall(pattern, content, re.S)
def counter(content):
    """Return the total number of matches of the module-level patterns
    PATTERN_LINE and PATTERN_BLOCK in `content`."""
    num_line = re.findall(PATTERN_LINE, content)    # was re.findAll
    num_block = re.findall(PATTERN_BLOCK, content)  # was re.findAll
    return len(num_line) + len(num_block)
import re
from urllib.request import urlopen  # Py3 form; the Py2 `from urllib import urlopen` no longer exists

# Scrape phone numbers and e-mail addresses from a JustDial listing page.
f = urlopen("https://www.justdial.com/Delhi/Baby-Sitters-in-Rohini/nct-10031239")
s = f.read().decode("utf-8", errors="replace")  # urlopen returns bytes in Py3
f.close()
# was re.findAll (no such attribute in the re module)
print(re.findall(r"\+\d{2}\s?0?\d{10}", s))
print(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", s))
from bs4 import BeautifulSoup as parse
from Calendar import Calendar
import re  # needed for the pagination regex below

OUTPUT_FILE_NAME = 'iCal.ics'
GOOGLE_OUTPUT_FILE_NAME = 'iCal_Google.ics'
DOMAIN = 'http://www.techvibes.com'
URL = DOMAIN + '/event/vancouver'

# First page: discover how many pages of events exist.
page = openPage(URL)
html = parse(page)
pageList = html.find(class_='pagination')
try:
    pageIndex = pageList.find(class_='active').a['title']
    # bare findAll(...) was undefined here — this is a regex search, so it
    # must be re.findall (raw string for the regex).
    numberOfPages = int(re.findall(r'Page 1 of (\d+)', pageIndex)[0])
except AttributeError:
    # No pagination block on the page: single page of results.
    numberOfPages = 1

calendar = Calendar()
for pageNumber in range(1, numberOfPages + 1):
    # print() emits the same "Page N" text the old Py2 print statement did.
    print('Page', pageNumber)
    url = URL + '/' + str(pageNumber)
    page = openPage(url)
    html = parse(page)
    content = html.find(id='content')
    eventListings = content.find_all(class_='event')
import re

s = "desc=(ProxyGoodsAnalysisServiceImpl.getGoodsTop)异常"
print(s)
# was re.findAll (AttributeError).  The parentheses in the target string are
# literal, so they must be escaped; the old lazy r"desc=(.+?)" pattern could
# only ever capture a single character.
matches = re.findall(r"desc=\((.+?)\)", s)
import urllib.parse
import urllib.request
import re

# Scrape the displayed price from the Yahoo Finance AAPL quote page.
response = urllib.request.urlopen(
    "https://in.finance.yahoo.com/quote/AAPL?p=AAPL")
html = response.read()
ht = html.decode('utf-8')
# The parens and dot inside the class attribute are literal text, so they
# must be escaped — unescaped they created stray capture groups that broke
# the match entirely.
regex = (r'<span class="Trsdu\(0\.3s\) Fw\(b\) Fz\(36px\) Mb\(-4px\) '
         r'D\(ib\)" data-reactid="247">(.+?)</span>')
pattern = re.compile(regex)
price = re.findall(pattern, ht)  # was re.findAll (no such attribute)
print(price)
def numOfProds(self):
    """Return the total product count parsed from the 'n to n of N' pager
    text in self.soup.

    Raises IndexError if the pager div or the 'of N' text is absent.
    """
    # self.soup.findAll is the legitimate BeautifulSoup API — only the
    # re call below was wrong (re.findAll does not exist).
    prods = self.soup.findAll('div', 'pag-n-to-n-txt')
    rg = r'of ([0-9]+)'
    num = int(re.findall(rg, prods[0].text)[0])
    return num  # original computed `num` but never returned it
def global_reg(pattern, content):
    """Run a DOTALL findall of `pattern` over `content` and return the list
    of matches.  Fixed: the re module has findall, not findAll."""
    return re.findall(pattern, content, re.S)
import requests
import re


# Original comment: "so hungry, when is dinner" (translated).
class test:
    def __init__(self):
        print("初始化jxhczzz")

    def getHtmlText(self, url):
        """Return the text of the page at `url`, or None on any failure."""
        try:
            # Original fetched the page but fell through without returning it,
            # so every call yielded None.
            return requests.get(url).text
        except:
            return None

    def resultList(self, html):
        """Print each element of the iterable `html`; returns None."""
        for a in html:
            print(a)


a = test()
html = a.getHtmlText("http://www.baidu.com")
# was re.findAll AND was missing the string to search (TypeError).
# The original pattern's `{\d}` is a literal-brace typo; a 1-4 digit
# optional extension is assumed here — TODO confirm intended quantifier.
resultlist = re.findall(r'\d{3}-\d{7,8}(-\d{1,4})?', html or "")
reslt = a.resultList(resultlist)
print("2019-08-17")