def load(self, address):
    try:
        web_handle = urllib2.urlopen(address)
        web_text = web_handle.read()
        # matches = sre.findall('\<td class="pl"\>(.*?)\&', web_text)
        # matches = sre.findall('\>(.*?)\ \<', web_text)
        date_match = sre.findall('(\d{1,2}\-\d{1,2}\-\d{2})', web_text)
        lines = sre.findall('\<td class="plleft"\>(.*?)\</td\>\</tr\>', web_text)
        if date_match != []:
            date = date_match[1]
            date = datetime.strptime(date, "%m-%d-%y")
            date = date.isocalendar()
            for line in lines:
                artist = ""
                song = ""
                album = ""
                matches = sre.findall('\<td class="pl"\>(.*?)\ ', line)
                tracker = 1
                playlist = True
                for match in matches:
                    if tracker == 1:
                        artist = match
                        tracker = 2
                    elif tracker == 2:
                        song = match
                        tracker = 3
                    elif tracker == 3:
                        album = match
                        self.add_song(artist, song, album, date)
                        tracker = 4
                    elif tracker == 4:
                        tracker = 1
                    else:
                        print "This shouldn't happen."
        else:
            playlist = False
            print "No playlist checkpoint 1"
        return playlist
    except urllib2.HTTPError, e:
        print "Cannot retrieve URL: HTTP Error Code", e.code
def online(self, word_to_find):
    setdefaulttimeout(5)
    website = urlopen(Request(web_dict + word_to_find)).read()
    if findall('<META NAME="ROBOTS" CONTENT="NOINDEX,FOLLOW">', website):
        return "LMGTFY " + google_search + word_to_find
    else:
        return "Try " + web_dict + word_to_find
def getgranularity(formattype):
    """returns the granularity range available from the given formattype"""
    # TODO: include all the formatting codes, weeks, etc
    year, month, day, hour, minute, second = range(6)
    finestgranularity = year
    widestgranularity = second
    for formatstr in sre.findall("%[a-zA-Z]", formattype):
        formatcode = formatstr[1]
        if formatcode in "jyY":
            codegranularity = year
        elif formatcode in "bBm":
            codegranularity = month
        elif formatcode in "aAdj":
            codegranularity = day
        elif formatcode in "HIp":
            codegranularity = hour
        elif formatcode in "M":
            codegranularity = minute
        elif formatcode in "S":
            codegranularity = second
        if codegranularity > finestgranularity:
            finestgranularity = codegranularity
        if codegranularity < widestgranularity:
            widestgranularity = codegranularity
    return finestgranularity, widestgranularity
def get_data():
    try:
        website = urllib2.urlopen(address)
        html = website.read()
        matches = sre.findall('<TD>[A-Za-z0-9\.]*', html)
        return matches
    except:
        return "Could not retrieve data."
def getModuleNameFromLine(word, line, column):
    # Take part of the line until column to make sure we don't get any matches after that.
    match = sre.findall(r'(?:[a-zA-Z0-9_]*\.)+' + word, line[:column])
    if not match:
        # We're not completing a modulename, so we return None
        return None
    # To be sure it's the right match, we take the last one and strip off the . and the word
    result = match[-1][:-len("." + word)]
    return result
def harvest_page(url, body):
    global harvest_regexp, harvlogfile
    print "OMG HARVESTING ROFL"
    list = sre.findall(harvest_regexp, body)
    print url + ":", str(list)
    if len(list) > 0:
        fp = open("logs/" + harvlogfile, "a")
        fp.write(url + ": " + str(list) + "\n\n")
        fp.close()
    print "DONE WITH", url
def lyrics(artist, song):
    try:
        address = 'http://www.azlyrics.com/lyrics/' + \
                  artist.replace(' ', '').lower() + '/' + \
                  song.replace(' ', '').lower() + '.html'
        web_handle = urllib2.urlopen(address)
        web_text = web_handle.read()
        lyrics = sre.findall('(?s)<!-- start of lyrics -->.*?<!', web_text, sre.MULTILINE)
        return lyrics
    except urllib2.HTTPError, e:
        print "Cannot retrieve URL: HTTP Error Code", e.code
def printamp(line):
    #print '@unblock', line, '->',
    m = sre.findall('#AT', line)
    outline = line
    if not m:
        #print outline
        return outline
    else:
        for i in range(len(m)):
            outline = outline.replace('#AT', '@')
        #print outline
        return outline
def unblock(line):
    #print '@unblock', line, '->',
    m = sre.findall('@[^\s]+', line)
    outline = line
    if not m:
        #print outline
        return outline
    else:
        for i in range(len(m)):
            s = m[i].replace('@', '').replace('%X%', ' ')
            outline = outline.replace(m[i], s)
        #print outline
        return outline
def blocked(line):
    #print '@ blocked', line, '->',
    m = sre.findall('@[^@]+@', line)
    outline = line
    if not m:
        #print outline
        return outline
    else:
        for i in range(len(m)):
            s = m[i][:-1].replace(' ', '%X%')
            outline = outline.replace(m[i], s, 1)
        #print outline
        return outline
def lyrics(self, artist, song):
    'Enter artist and song to find lyrics on one of two online databases'
    song = song.replace(',', '')
    try:
        begin = artist[0] + artist[1] + artist[2]
        if begin.lower() == 'the':
            artist = artist[4:]
        address = 'http://www.azlyrics.com/lyrics/' + \
                  artist.replace(' ', '').lower() + '/' + \
                  song.replace(' ', '').lower() + '.html'
        web_handle = urllib2.urlopen(address)
        web_text = web_handle.read()
        lyrics = sre.findall('(?s)<!-- start of lyrics -->(.*?)<!', web_text, sre.MULTILINE)
        lyrics = lyrics[0]
        lyrics = lyrics.replace('\n', '')
        lyrics = lyrics.replace('\r\n', '')
        return lyrics
    except (urllib2.HTTPError, urllib2.URLError):
        # if not found, try a different website
        try:
            begin = artist[0] + artist[1] + artist[2]
            if begin.lower() == 'the':
                artist = artist[4:]
            address = 'http://indierocklyrics.com/' + \
                      artist.replace(' ', '-').lower() + '/' + \
                      song.replace(' ', '-').lower() + '-lyrics'
            web_handle = urllib2.urlopen(address)
            web_text = web_handle.read()
            lyrics = sre.findall('(?s)<p>\ <br\ />(.*?)<a\ href\=\"http://www', web_text, sre.MULTILINE)
            lyrics = lyrics[0]
            lyrics = lyrics.replace('\n', '')
            lyrics = lyrics.replace('\r\n', '')
            return lyrics
        except urllib2.HTTPError, e:
            print "Cannot retrieve URL: HTTP Error Code", e.code
def __parse_attribute(self, s):
    avlist = {}
    map(lambda x: avlist.update({x[0]: x[1]}),
        re.findall("(\w+)=\"([^\"]*)\"\s*", s))
    return avlist
def scanSGF(target):
    if isdir(target):
        for sgf_file in os.listdir(target):
            if sgf_file != 'Desktop':
                scanSGF(target + os.sep + sgf_file)
    elif isfile(target) and target[len(target)-4:] == '.sgf':
        try:
            # Check to see if there is already a result tag. At the moment, it will accept any sort
            # of result tag, but this is easily modified to replace result tags that are improper.
            if len(sre.findall('(RE\[.*\])', file(target, 'r').read())) > 0:
                print target + " already has a result. Skipping..."
            else:
                print target + ":",
                next_move = sre.findall('([B|W])\[[a-z][a-z]\]', file(target, 'r').read())[-1]
                # next_move looks inside the SGF to find the last player who made a move. This is so that later,
                # the GnuGo engine knows for which player to generate a move.
                if next_move == 'W':
                    next_move = 'black'
                else:
                    next_move = 'white'
                # The black/white syntax is needed by the GTP protocol.
                gtp_test = file('gtp_test', 'w')
                gtp_test.write('reg_genmove ' + next_move + '\ntop_moves\nquit')
                gtp_test.flush()
                # Although it would technically be possible to bind gnugo's STDIN and STDOUT to Python, it is
                # just so much simpler to put the commands in a file. The file is deleted later anyway.
                gnugo_session = os.popen('gnugo --mode gtp --gtp-input gtp_test -l ' + target).read()
                if len(sre.findall('PASS', gnugo_session)) > 0:
                    move_value = 0
                    # If GnuGo thinks the best move is to pass, then the game is obviously over, and setting
                    # move_value to 0 will ensure that the game will later be given to GnuGo to estimate score.
                else:
                    move_value = sre.findall('([\d\d|\d]\.[\d|\d\d])', gnugo_session)[0]
                    # Since GnuGo will give the values of the moves in reverse order that they are played, the
                    # value of the most recent move (which we generated in gtp_test) will be the first one.
                    # This is the value we want to check for size.
                if next_move == 'black':
                    next_move = 'W'
                else:
                    next_move = 'B'
                # I am making an assumption here, that the last person to move is going to win the game.
                # It seems silly for a player to make a move and then resign, but it is not an impossibility.
                # Therefore, if you need that extra bit of accuracy, you can make some minor modifications
                # to check the estimated score regardless of whether the game ended in resign or not, and
                # use that as a sign of who won.
                game_result = next_move + '+R'
                if float(move_value) < 2:
                    # If the value of the move generated by GnuGo is less than 2, then it is clear that the late
                    # endgame has been reached, and the game is probably over. In this case, we will use GnuGo
                    # to calculate the relative score.
                    result_string = os.popen('gnugo -l ' + target + ' --score estimate').read()
                    winning_color = result_string[:1]
                    score_estimate = sre.findall('(\d.\d)', result_string)[0]
                    game_result = winning_color + '+' + score_estimate
                print game_result
                sgf_raw = file(target, 'r')
                file_dump = sgf_raw.read()
                file_dump = sre.sub(RE_PREFIX, PREFIX + 'RE[' + game_result + ']', file_dump)
                sgf_write = file(target, 'w')
                sgf_write.write(file_dump)
                sgf_write.flush()
                sgf_write.close()
                os.remove('gtp_test')  # Remove the old gtp_test.
        except IndexError:
            print "Error with SGF " + target + ". Deleting ..."
            error_log = file('error_log', 'a')
            error_log.write("Error on " + target + ". Deleting file.\n")
            error_log.flush()
            error_log.close()
            os.remove(target)
            # Comment out the previous line if you would like to keep illegal SGF's.
        except Exception:
            print "Error. Skipping ..."
            print Exception
            error_log = file('error_log', 'a')
            error_log.write("Error on " + target + ". Skipping file.\n")
            error_log.flush()
            error_log.close()
import sys
import urllib2
import sre
from bs4 import BeautifulSoup

url = "http://www.2dehands.be/autos/?show_markt_uitvoer=1&locale=all&plaatsdatum__x=30&auto_bj__tot=1980"
website = urllib2.urlopen(url)
website_html = website.read()

soup = BeautifulSoup(website_html)
nice = soup.prettify()
print nice

matches = sre.findall('<a href="http://www.2dehands.be/autos/.*', nice)
print matches

for link in soup.find_all('a'):
    print(link.get('href'))

# matches = sre.findall('<a href="http://www.2dehands.be/autos/.*', nice)
# print matches
import sys
import BaseHTTPServer
import urllib2
import sre

outf = open('texts3.txt', 'w')
sys.stdout = outf

for i in range(20, 30):
    sock = urllib2.urlopen("https://www.goodreads.com/quotes?page=" + str(i))
    htmlSource = sock.read()
    sock.close()
    #print htmlSource
    matches = sre.findall('“(.*?)”', htmlSource)
    for stri in matches:
        print stri
    raise TestFailed, "sre.split"

try:
    assert sre.split(":", ":a:b::c", 2) == ['', 'a', 'b::c']
    assert sre.split(':', 'a:b:c:d', 2) == ['a', 'b', 'c:d']
    assert sre.split("(:)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c']
    assert sre.split("(:*)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c']
except AssertionError:
    raise TestFailed, "qualified sre.split"

if verbose:
    print "Running tests on sre.findall"

try:
    assert sre.findall(":+", "abc") == []
    assert sre.findall(":+", "a:b::c:::d") == [":", "::", ":::"]
    assert sre.findall("(:+)", "a:b::c:::d") == [":", "::", ":::"]
    assert sre.findall("(:)(:*)", "a:b::c:::d") == [(":", ""), (":", ":"), (":", "::")]
    assert sre.findall("(a)|(b)", "abc") == [("a", ""), ("", "b")]
except AssertionError:
    raise TestFailed, "sre.findall"

if verbose:
    print "Running tests on sre.match"

try:
    # No groups at all
    m = sre.match('a', 'a')
    assert m.groups() == ()
idMap = IdentifierMap()

globalReplace = str()
if len(sys.argv) > 2:
    globalReplace = sys.argv[2]
    globalReplace = globalReplace.split()
    print globalReplace

kernel = psci.Kernel.CreateKernelInCurrentThread()
agent = kernel.CreateAgent('obfuscator')
kernel.ExecuteCommandLine('source "%s"' % sys.argv[1], 'obfuscator')
original = kernel.ExecuteCommandLine('print -f', 'obfuscator')
original = original.split('\n')

for line in original:
    match = sre.match(r'^\s*sp\s*\{(.+)\s*$', line)
    if match is not None:
        line = line.replace(match.group(1), idMap.getIdentifier(match.group(1)))
    else:
        vars = sre.findall(r'<(\S+)>', line)
        for var in vars:
            line = line.replace('<' + var + '>', '<' + idMap.getIdentifier(var) + '>')
        match = sre.match(r'.*\^name ([\w-]+)', line)
        if match is not None:
            line = line.replace('^name %s' % match.group(1),
                                '^name %s' % idMap.getIdentifier(match.group(1)))
        for word in globalReplace:
            line = line.replace(word, idMap.getIdentifier(word))
    print line

            "subject": "New apartment ad",
            "text": "New ad: " + url})):
        status = True
    return status

match_set = set()

# Get listing html source
website_handle = retrieveWebPage(listing)
website_text = website_handle.read()

# Find matches from listing page.
matches = sre.findall('class="item_link xiti_ad_heading" .*href="(.*?)"', website_text)

# Add matches to match_set
for match in matches:
    match_set.add(match)

match_set = list(match_set)

# Establish db connection
connection = MongoClient(mongocon)

# Use ads database and blocket collection
db = connection.ads.blocket

# Iterate urls
for item in match_set:
    else:
        f = open(filename)
        version = f.readlines()
        f.close()
        return version[0].rstrip()

# Find out script location:
script_root = os.path.dirname(os.path.realpath(__file__))

# Do:
last_version = loadLastVersion(os.path.join(script_root, last_version_installed_fn))

website_handle = retrieveWebPage(plex_download_address)
website_text = website_handle.read()

logging.debug('Parsing download page')
matches = sre.findall('<a href="(.*_i386.deb)"[^>]*>32-bit</a>', website_text)
if len(matches) > 1:
    logging.error('Parsing URL: too many matches')
    sys.exit(1)

logging.debug('Parsing package file URL')
version = sre.findall('plexmediaserver_(.*)_i386.deb', matches[0])
if len(version) > 1:
    logging.error('Parsing package URL: too many versions')
    sys.exit(1)

logging.debug('Comparing versions')
if last_version == version[0]:
    logging.info(version[0] + ': is already up-to-date')
    sys.exit(0)
else:
def returnOne(regex, string):
    data = sre.findall(regex, string)
    for value in data:
        return value
import urllib2
import sre
import sys

#implementation taken from
#http://www.techrepublic.com/article/parsing-data-from-the-web-in-python/

def title_from_url(streamurl):
    website = None
    streamtitle = None
    try:
        website = urllib2.urlopen(streamurl)
    except urllib2.HTTPError, e:
        print("Cannot retrieve URL: HTTP Error Code", e.code)
    except urllib2.URLError, e:
        print("Cannot retrieve URL: ", e.reason)
    if website:
        pagehtml = website.read()
        streamtitle = sre.findall("<meta content='(.*?)' property='og:description'>", pagehtml)[0]
    return streamtitle

if __name__ == '__main__':
    if len(sys.argv) == 1:
        streamurl = 'http://twitch.tv/morrow'  # test string
    else:
        streamurl = sys.argv[1]
    print(title_from_url(streamurl))
# -*- coding: utf-8 -*-
import urllib2
import re
import sre
import datetime
import time

url = "https://autocms.accre.vanderbilt.edu/depotmon/index.html"
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
contents = urllib2.urlopen(req).read()
#unicode_contents = contents.decode('gbk', 'ignore')
unicode_contents = contents.decode('utf-8').encode('utf-8')

patternA = 'USED and '
patternB = ' TB FREE'
pattern = patternA + "(.*)" + patternB
matches = sre.findall(pattern, unicode_contents)

now = datetime.datetime.now()
dateDay = now.strftime("%m/%d/%Y-%H:%M")

fileOutName = "output_vandysize_usable.txt"
fileOutput = open(fileOutName, "a")
fileOutput.write(time.strftime("%m/%d/%Y-%H:%M"))
#print matches
# outStrPre='show1image("' + matches[i][3] + '", "' + matches[i][0] + '", "width:100%", "热门", "' + matches[i][1] + '", "");'
fileOutput.write(" " + matches[0] + " " + str(float(matches[0]) * 2.0 / 3.0 * 0.85) + "\n")
fileOutput.close()
# ------------------------------------------------------------------------------------ Here we go!
if cronJob:
    chdir(workDir)

print "\nArbeitsverzeichnis: " + getcwd() + "\n"
saveTxtFile("letzer zugriff: " + meineZeit(), "zg.txt")
newDirCh(beautifulCrs(crs))

website_html = getSitePwd(genURL(year, sem, crs), usr, pwd)
matches = sre.findall('<video xmlns:xsi=.*?<\/video>', website_html)

for match in matches:
    print "\n\n =============================================================== <<o>>"
    title = returnOne('title=".*?"', match)
    datum = title[10:20]
    name = title[21:len(title)-1]
    newDirCh(datum)
    print "\n---> " + year + " " + beautifulSem(sem) + " " + beautifulCrs(crs) + " Datum: " + datum + " Name: " + name + "\n"
    # Comments
    if cmt == "j":
if len(sys.argv) < 2:
    print "Usage:"
    print "%s url" % (sys.argv[0])
    sys.exit(1)

match_set = set()
address = parseAddress(sys.argv[1])
website_handle = retrieveWebPage(address)
website_text = website_handle.read()

dir = website_handle.geturl().rsplit('/', 1)[0]
if dir == "http:/":
    dir = website_handle.geturl()

matches = sre.findall('<img .*src="(.*?)"', website_text)

for match in matches:
    if match[:7] != "http://":
        if match[0] == "/":
            slash = ""
        else:
            slash = "/"
        match_set.add(dir + slash + match)
    else:
        match_set.add(match)

match_set = list(match_set)
match_set.sort()

for item in match_set:
def Netflix():
    service = sre.findall('friendlyName":"([^"]+)', HTMLsource2)
    if 'Netflix Instant' in service:
        return True
    else:
        return False
def Repeats(s, type):
    return len(sre.findall(type, s.data))