Example #1
    def load(self, address):
        try:
            web_handle = urllib2.urlopen(address)
            web_text = web_handle.read()
     #       matches = sre.findall('\<td class="pl"\>(.*?)\&', web_text)
     #       matches = sre.findall('\>(.*?)\&nbsp;\<', web_text)
            date_match = sre.findall('(\d{1,2}\-\d{1,2}\-\d{2})', web_text)

            lines = sre.findall('\<td class="plleft"\>(.*?)\</td\>\</tr\>', \
                                web_text)
                
            playlist = False
            if date_match:
                date = date_match[1]
                date = datetime.strptime(date, "%m-%d-%y")
                date = date.isocalendar()
                
                for line in lines:
                    
                    artist = ""
                    song = ""
                    album = ""

                    matches = sre.findall('\<td class="pl"\>(.*?)\&nbsp', line)
                    tracker = 1
                    playlist = True
                    
                    for match in matches:
                        if tracker == 1:
                            artist = match
                            tracker = 2
                        elif tracker == 2:
                            song = match
                            tracker = 3
                        elif tracker == 3:
                            album = match
                            self.add_song(artist, song, album, date)
 
                            tracker = 4
                        elif tracker == 4:
                            tracker = 1
                        else:
                            print "Wtf this shouldn't happen."
            else:
                print "No playlist checkpoint 1"

            return playlist
        
        except urllib2.HTTPError, e:
            print "Cannot retreieve URL: HTTP Error Code", e.code
Example #2
	def online(self, word_to_find):
		setdefaulttimeout(5)
		website = urlopen(Request(web_dict + word_to_find)).read()
		if findall('<META NAME="ROBOTS" CONTENT="NOINDEX,FOLLOW">', website):
			return "LMGTFY " + google_search + word_to_find
		else:
			return "Try " + web_dict + word_to_find
Example #3
def getgranularity(formattype):
  """returns the granularity range available from the given formattype"""
  # TODO: include all the formatting codes, weeks, etc
  year, month, day, hour, minute, second = range(6)
  finestgranularity = year
  widestgranularity = second
  for formatstr in sre.findall("%[a-zA-Z]", formattype):
    formatcode = formatstr[1]
    if formatcode in "jyY":
      codegranularity = year
    elif formatcode in "bBm":
      codegranularity = month
    elif formatcode in "aAdj":
      codegranularity = day
    elif formatcode in "HIp":
      codegranularity = hour
    elif formatcode in "M":
      codegranularity = minute
    elif formatcode in "S":
      codegranularity = second
    if codegranularity > finestgranularity:
      finestgranularity = codegranularity
    if codegranularity < widestgranularity:
      widestgranularity = codegranularity
  return finestgranularity, widestgranularity
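# A minimal usage sketch (assumed format string, not from the original code): with the
# year..second constants numbered 0..5 as above, "%Y-%m-%d %H:%M" should yield
# finest = minute (4) and widest = year (0).
print getgranularity("%Y-%m-%d %H:%M")   # -> (4, 0)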
Example #4
def get_data():
    try:
        website = urllib2.urlopen(address)
        html = website.read()
        matches = sre.findall('<TD>[A-Za-z0-9\.]*', html)
        return matches
    except:
        return "Could not retrieve data."
Example #5
def getModuleNameFromLine(word, line, column):
    # Take part of the line until column to make sure we don't get any matches after that.
    match = sre.findall(r'(?:[a-zA-Z0-9_]*\.)+'+word, line[:column])
    if not match:
        # We're not completing a modulename, so we return None
        return None
    # To be sure it's the right match, we take the last one and strip off the . and the word
    result = match[-1][:-len("."+word)]
    return result
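# A minimal usage sketch (assumed arguments): completing the word "path" at column 14 of
# "import os.path" strips the trailing ".path" from the last dotted match, leaving "os".
print getModuleNameFromLine("path", "import os.path", 14)   # -> 'os'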
Example #6
def harvest_page(url, body):
    global harvest_regexp, harvlogfile
    print "OMG HARVESTING ROFL"
    list = sre.findall(harvest_regexp, body)
    print url + ":", str(list)
    if len(list) > 0:
        fp = open("logs/" + harvlogfile, "a")
        fp.write(url + ": " + str(list) + "\n\n")
        fp.close()
    print "DONE WITH", url
Example #7
def harvest_page(url,body):
	global harvest_regexp,harvlogfile
	print "OMG HARVESTING ROFL"
	list=sre.findall(harvest_regexp,body)
	print url+":",str(list)
	if len(list)>0:
		fp=open("logs/"+harvlogfile,"a")
		fp.write(url+": "+str(list)+"\n\n")
		fp.close()
	print "DONE WITH",url
Example #8
 def lyrics(artist, song):
     try:
         address = 'http://www.azlyrics.com/lyrics/' + \
                   artist.replace(' ', '').lower() + '/' + \
                   song.replace(' ', '').lower() + '.html'
         web_handle = urllib2.urlopen(address)
         web_text = web_handle.read()
         lyrics = sre.findall('(?s)<!-- start of lyrics -->.*?<!', web_text, sre.MULTILINE)
         
         return lyrics
     except urllib2.HTTPError, e:
         print "Cannot retreieve URL: HTTP Error Code", e.code
Example #9
def printamp(line):
    #print '@unblock', line, '->',
    m = sre.findall('#AT', line)
    outline = line
    if not m:
        #print outline
        return (outline)
    else:
        for i in range(len(m)):
            outline = outline.replace('#AT', '@')
        #print outline
        return (outline)
Example #10
def printamp(line):
    #print '@unblock', line, '->',
    m = sre.findall('#AT',line)
    outline = line
    if not m :
        #print outline
        return(outline)
    else:
        for i in range(len(m)):
            outline = outline.replace('#AT','@')
        #print outline
        return(outline)
Example #11
def unblock(line):
    #print '@unblock', line, '->',
    m = sre.findall('@[^\s]+', line)
    outline = line
    if not m:
        #print outline
        return (outline)
    else:
        for i in range(len(m)):
            s = m[i].replace('@', '').replace('%X%', ' ')
            outline = outline.replace(m[i], s)
        #print outline
        return (outline)
Example #12
def blocked(line):
    #print '@ blocked', line , '->',
    m = sre.findall('@[^@]+@', line)
    outline = line
    if not m:
        #print  outline
        return (outline)
    else:
        for i in range(len(m)):
            s = m[i][:-1].replace(' ', '%X%')
            outline = outline.replace(m[i], s, 1)
        #print outline
        return (outline)
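# A small round-trip sketch (assumed sample input), using blocked() together with the
# unblock() helper from the previous example: blocked() hides the spaces inside an
# @...@ span behind the %X% placeholder, and unblock() restores them.
demo = blocked('say @hello world@ now')
print demo            # -> 'say @hello%X%world now'
print unblock(demo)   # -> 'say hello world now'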
Example #13
def unblock(line):
    #print '@unblock', line, '->',
    m = sre.findall('@[^\s]+',line)
    outline = line
    if not m :
        #print outline
        return(outline)
    else:
        for i in range(len(m)):
            s=m[i].replace('@','').replace('%X%',' ')
            outline = outline.replace(m[i],s)
        #print outline
        return(outline)
Example #14
def blocked(line):
    #print '@ blocked', line , '->',
    m = sre.findall('@[^@]+@',line)
    outline = line
    if not m :
        #print  outline
        return(outline)
    else:
        for i in range(len(m)):
            s=m[i][:-1].replace(' ','%X%')
            outline = outline.replace(m[i],s,1)
        #print outline
        return(outline)
Example #15
    def lyrics(self, artist, song):
        'Enter artist and song to find lyrics on one of two online databases'
        song = song.replace(',','')
        try:
            begin = artist[0]+artist[1]+artist[2]
            if begin.lower() == 'the':
                artist = artist[4:]

        
            address = 'http://www.azlyrics.com/lyrics/' + \
                      artist.replace(' ', '').lower() + '/' + \
                      song.replace(' ', '').lower() + '.html'
            web_handle = urllib2.urlopen(address)
            web_text = web_handle.read()
            lyrics = sre.findall('(?s)<!-- start of lyrics -->(.*?)<!', web_text, sre.MULTILINE)
            lyrics = lyrics[0]
            lyrics = lyrics.replace('\r\n', '')
            lyrics = lyrics.replace('\n', '')
            return lyrics
        except (urllib2.HTTPError, urllib2.URLError): # if not found, try different website
            try:
                begin = artist[0]+artist[1]+artist[2]
                if begin.lower() == 'the':
                    artist = artist[4:]
                    
                address = 'http://indierocklyrics.com/' + \
                          artist.replace(' ', '-').lower() + '/' + \
                          song.replace(' ', '-').lower() + '-lyrics'
                web_handle = urllib2.urlopen(address)
                web_text = web_handle.read()
                lyrics = sre.findall('(?s)<p>\&nbsp;<br\ />(.*?)<a\ href\=\"http://www'\
                                     , web_text, sre.MULTILINE)
                lyrics = lyrics[0]
                lyrics = lyrics.replace('\r\n', '')
                lyrics = lyrics.replace('\n', '')
                return lyrics
            except urllib2.HTTPError, e:
                print "Cannot retreieve URL: HTTP Error Code", e.code
Example #16
 def __parse_attribute(self, s):
   avlist = { }
   map(lambda x: avlist.update({x[0]: x[1]}), re.findall("(\w+)=\"([^\"]*)\"\s*", s))
   return (avlist)
Example #17
def scanSGF (target):
	if isdir(target):
		for sgf_file in os.listdir(target):
			if sgf_file!='Desktop':
				scanSGF(target+os.sep+sgf_file)
	elif isfile(target) and target[len(target)-4:]=='.sgf':
		try:
			#Check to see if there is already a result tag. At the moment, it will accept any sort
			#of result tag, but this is easily modified to replace result tags that are improper.
			if len(sre.findall('(RE\[.*\])',file(target,'r').read())) >0:
				print target+" already has a result. Skipping..."
			else:
				print target+":",
				next_move = sre.findall('([B|W])\[[a-z][a-z]\]',file(target,'r').read())[-1]
				#next_move looks inside the SGF to find the last player who made a move. This is so that later, the
				#GnuGo engine knows for which player to generate a move.
				if next_move=='W':
					next_move='black'
				else:
					next_move='white'
				#The black/white syntax is needed by the GTP protocol.
				gtp_test = file('gtp_test','w')
				gtp_test.write('reg_genmove '+next_move+'\ntop_moves\nquit')
				gtp_test.flush()
				#Although it would technically be possible to bind gnugo's STDIN and STDOUT to Python, it is just
				#so much simpler to put the commands in a file. The file is deleted later anyway.
				gnugo_session = os.popen('gnugo --mode gtp --gtp-input gtp_test -l '+target).read()
				if len(sre.findall('PASS',gnugo_session))>0:
					move_value = 0
					#If GnuGo thinks the best move is to pass, then the game is obviously over, and setting
					#move_value to 0 will ensure that the game will later be given to GnuGo to estimate score.
				else:
					move_value = sre.findall('(\d{1,2}\.\d{1,2})',gnugo_session)[0]
					#Since GnuGo will give the values of the move in reverse order that they are played, the
					#value of the most recent move (which we generated in gtp_test) will be the first one.
					#This is the value we want to check for size.
				if next_move=='black':
					next_move='W'
				else:
					next_move='B'
					#I am making an assumption here, that the last person to move is going to win the game.
					#It seems silly for a player to make a move and then resign, but it is not an impossibility.
					#Therefore, if you need that extra bit of accuracy, you can make some minor modifications
					#to check the estimated score regardless of whether the game ended in resign or not, and
					#use that as a sign of who won.
				game_result = next_move+'+R'
				if float(move_value)<2:
					#If the value of the move generated by GnuGo is less than 2, then it is  clear that the late
					#endgame has been reached, and the game is probably over. In this case, we will use GnuGo
					#to calculate the relative score.
					result_string = os.popen('gnugo -l '+target+' --score estimate').read()
					winning_color = result_string[:1]
					score_estimate = sre.findall('(\d+\.\d+)',result_string)[0]
					game_result = winning_color+'+'+score_estimate
				print game_result
				sgf_raw = file(target,'r')
				file_dump = sgf_raw.read()
				file_dump = sre.sub(RE_PREFIX,PREFIX+'RE['+game_result+']',file_dump)
				sgf_write=file(target,'w')
				sgf_write.write(file_dump)
				sgf_write.flush()
				sgf_write.close()
				os.remove('gtp_test')
				#Remove the old gtp_test.
		except IndexError:
			print "Error with SGF "+target+". Deleting ..."
			error_log = file('error_log','a')
			error_log.write("Error on "+target+". Deleting file.\n")
			error_log.flush()
			error_log.close()
			os.remove(target)
			#Comment out the previous line if you would like to keep illegal SGF's.
		except Exception, e:
			print "Error. Skipping ..."
			print e
			error_log = file('error_log','a')
			error_log.write("Error on "+target+". Skipping file.\n")
			error_log.flush()
			error_log.close()
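# A minimal illustration (assumed inline SGF text, not a real file) of the move-colour
# regex used in scanSGF above: the last captured letter is the colour that moved last.
sample_sgf = '(;GM[1]SZ[19];B[pd];W[dp];B[qp])'
print sre.findall('([B|W])\[[a-z][a-z]\]', sample_sgf)[-1]   # -> 'B'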
Example #18
import sys
import urllib2
import sre
from bs4 import BeautifulSoup

url = "http://www.2dehands.be/autos/?show_markt_uitvoer=1&locale=all&plaatsdatum__x=30&auto_bj__tot=1980"

website = urllib2.urlopen(url)
website_html = website.read()
    
soup = BeautifulSoup(website_html)
nice = soup.prettify()
print nice
matches = sre.findall('<a href="http://www.2dehands.be/autos/.*', nice)
print matches
for link in soup.find_all('a'):
  print(link.get('href'))



#  matches = sre.findall('<a href="http://www.2dehands.be/autos/.*', nice)
#  print matches
Example #19
import sys
import BaseHTTPServer
import urllib2
import sre

outf = open('texts3.txt','w')
sys.stdout = outf;

for i in range(20,30):                                       
	sock = urllib2.urlopen("https://www.goodreads.com/quotes?page="+str(i))
	htmlSource = sock.read()                            
	sock.close()                                        
	#print htmlSource 
	matches = sre.findall('&ldquo;(.*?)&rdquo;', htmlSource)
	for stri in matches:
		print stri
Example #20
    raise TestFailed, "sre.split"

try:
    assert sre.split(":", ":a:b::c", 2) == ['', 'a', 'b::c']
    assert sre.split(':', 'a:b:c:d', 2) == ['a', 'b', 'c:d']

    assert sre.split("(:)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c']
    assert sre.split("(:*)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c']
except AssertionError:
    raise TestFailed, "qualified sre.split"

if verbose:
    print "Running tests on sre.findall"

try:
    assert sre.findall(":+", "abc") == []
    assert sre.findall(":+", "a:b::c:::d") == [":", "::", ":::"]
    assert sre.findall("(:+)", "a:b::c:::d") == [":", "::", ":::"]
    assert sre.findall("(:)(:*)", "a:b::c:::d") == [(":", ""),
                                                   (":", ":"),
                                                   (":", "::")]
    assert sre.findall("(a)|(b)", "abc") == [("a", ""), ("", "b")]
except AssertionError:
    raise TestFailed, "sre.findall"

if verbose:
    print "Running tests on sre.match"

try:
    # No groups at all
    m = sre.match('a', 'a') ; assert m.groups() == ()
Example #21
idMap = IdentifierMap()

globalReplace = str()
if len(sys.argv) > 2:
    globalReplace = sys.argv[2]
globalReplace = globalReplace.split()
print globalReplace

kernel = psci.Kernel.CreateKernelInCurrentThread()
agent = kernel.CreateAgent('obfuscator')
kernel.ExecuteCommandLine('source "%s"' % sys.argv[1], 'obfuscator')
original = kernel.ExecuteCommandLine('print -f', 'obfuscator')
original = original.split('\n')
for line in original:
    match = sre.match(r'^\s*sp\s*\{(.+)\s*$', line)
    if match is not None:
        line = line.replace(match.group(1), idMap.getIdentifier(match.group(1)))
    else:
        vars = sre.findall(r'<(\S+)>', line)
        for var in vars:
            line = line.replace('<' + var + '>', '<' + idMap.getIdentifier(var) + '>')

        match = sre.match(r'.*\^name ([\w-]+)', line)
        if match is not None:
            line = line.replace('^name %s' % match.group(1), '^name %s' % idMap.getIdentifier(match.group(1)))

    for word in globalReplace:
        line = line.replace(word, idMap.getIdentifier(word))
        
    print line
        
Example #22
                          "subject": "New apartment ad",
                          "text": "New ad: " + url
                      })):
        status = True

    return status


match_set = set()

# Get listing html source
website_handle = retrieveWebPage(listing)
website_text = website_handle.read()

# Find matches from listing page.
matches = sre.findall('class="item_link xiti_ad_heading" .*href="(.*?)"',
                      website_text)

# Add matches to match_set
for match in matches:
    match_set.add(match)

match_set = list(match_set)

# Establish db connection
connection = MongoClient(mongocon)

# Use ads database and blocket collection
db = connection.ads.blocket

# Iterate urls
for item in match_set:
Example #23
	else:
		f = open(filename)
		version = f.readlines()
		f.close()
		return version[0].rstrip()
		
# Find out script location:
script_root = os.path.dirname(os.path.realpath(__file__))

# Do:
last_version   = loadLastVersion(os.path.join(script_root, last_version_installed_fn))
website_handle = retrieveWebPage(plex_download_address)
website_text   = website_handle.read()

logging.debug('Parsing download page')
matches = sre.findall('<a href="(.*_i386.deb)"[^>]*>32-bit</a>', website_text)
if len(matches) != 1:
	logging.error('Parsing URL: expected exactly one match, got %d' % len(matches))
	sys.exit(1)

logging.debug('Parsing package file URL')
version = sre.findall('plexmediaserver_(.*)_i386.deb', matches[0])
if len(version) != 1:
	logging.error('Parsing package URL: expected exactly one version, got %d' % len(version))
	sys.exit(1)

logging.debug('Comparing versions')
if last_version == version[0]:
	logging.info(version[0] + ': is already up-to-date')
	sys.exit(0)
else:
Example #24
def returnOne(regex, string):
	data  = sre.findall(regex, string)
	for value in data:
		return value
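# A hedged usage sketch (assumed sample tag, with sre imported as in the other examples):
# returnOne yields the first match, or None when nothing matches; Example #27 calls it
# with the same 'title=".*?"' pattern.
print returnOne('title=".*?"', '<video title="2015-04-13 Intro Lecture">')   # -> title="2015-04-13 Intro Lecture"
print returnOne('title=".*?"', '<video/>')                                   # -> None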
Example #25
import urllib2
import sre
import sys

#implementation taken from
#http://www.techrepublic.com/article/parsing-data-from-the-web-in-python/

def title_from_url( streamurl ):
    website = None
    streamtitle = None
    try:
        website = urllib2.urlopen(streamurl)
    except urllib2.HTTPError, e:
            print("Cannot retrieve URL: HTTP Error Code", e.code)
    except urllib2.URLError, e:
            print("Cannot retrieve URL: " , e.reason)
    if website:
        pagehtml = website.read()
        streamtitle = sre.findall("<meta content='(.*?)' property='og:description'>",pagehtml)[0]
    return streamtitle

if __name__ == '__main__':
    if len(sys.argv) == 1:
        streamurl = 'http://twitch.tv/morrow' #test string
    else:
        streamurl = sys.argv[1]
    print(title_from_url( streamurl ) )
Example #26
# -*- coding: utf-8 -*-

import urllib2
import re
import sre
import datetime
import time

url = "https://autocms.accre.vanderbilt.edu/depotmon/index.html"
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
contents = urllib2.urlopen(req).read()
#unicode_contents = contents.decode('gbk', 'ignore')
unicode_contents = contents.decode('utf-8').encode('utf-8')
patternA = 'USED and '
patternB = ' TB FREE'
pattern = patternA + "(.*)" + patternB
matches = sre.findall(pattern, unicode_contents)

now = datetime.datetime.now()
dateDay = now.strftime("%m/%d/%Y-%H:%M")
fileOutName = "output_vandysize_usable.txt"
fileOutput = open(fileOutName, "a")
fileOutput.write(time.strftime("%m/%d/%Y-%H:%M"))

#print matches
#   outStrPre='show1image("' + matches[i][3] + '", "' + matches[i][0] + '", "width:100%", "热门", "' + matches[i][1] + '", "");'

fileOutput.write("   " + matches[0] + "   " +
                 str(float(matches[0]) * 2.0 / 3.0 * 0.85) + "\n")
fileOutput.close()
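# A quick sketch (assumed sample text, not the real depotmon page) of what the pattern
# built above extracts: the free-space figure sitting between the two markers.
sample = "123.4 TB USED and 45.6 TB FREE"
print sre.findall(patternA + "(.*)" + patternB, sample)   # -> ['45.6']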
Example #27
# ------------------------------------------------------------------------------------ Here we go!
if cronJob :
	chdir(workDir)
print "\nArbeitsverzeichnis: "+getcwd()+"\n"

saveTxtFile("letzer zugriff: "+meineZeit(),"zg.txt")


newDirCh(beautifulCrs(crs))


website_html = getSitePwd(genURL(year,sem,crs),usr,pwd)


matches = sre.findall('<video xmlns:xsi=.*?<\/video>', website_html)

for match in matches:

	print "\n\n =============================================================== <<o>>"

	title = returnOne('title=".*?"',match)
	datum = title[10:20]
	name  = title[21:len(title)-1]
	
	newDirCh(datum)

	print "\n---> "+year+" "+beautifulSem(sem)+" "+beautifulCrs(crs)+" Datum: "+datum+" Name: "+name+"\n"

	#Comments
	if cmt == "j":	
Example #28
    globalReplace = sys.argv[2]
globalReplace = globalReplace.split()
print globalReplace

kernel = psci.Kernel.CreateKernelInCurrentThread()
agent = kernel.CreateAgent('obfuscator')
kernel.ExecuteCommandLine('source "%s"' % sys.argv[1], 'obfuscator')
original = kernel.ExecuteCommandLine('print -f', 'obfuscator')
original = original.split('\n')
for line in original:
    match = sre.match(r'^\s*sp\s*\{(.+)\s*$', line)
    if match is not None:
        line = line.replace(match.group(1),
                            idMap.getIdentifier(match.group(1)))
    else:
        vars = sre.findall(r'<(\S+)>', line)
        for var in vars:
            line = line.replace('<' + var + '>',
                                '<' + idMap.getIdentifier(var) + '>')

        match = sre.match(r'.*\^name ([\w-]+)', line)
        if match is not None:
            line = line.replace(
                '^name %s' % match.group(1),
                '^name %s' % idMap.getIdentifier(match.group(1)))

    for word in globalReplace:
        line = line.replace(word, idMap.getIdentifier(word))

    print line
Example #29
if len(sys.argv) < 2:
        print "Usage:"
        print "%s url" % (sys.argv[0])
        sys.exit(1)

match_set = set()

address = parseAddress(sys.argv[1])
website_handle = retrieveWebPage(address)
website_text = website_handle.read()

dir = website_handle.geturl().rsplit('/',1)[0]
if (dir == "http:/"):
        dir = website_handle.geturl()

matches = sre.findall('<img .*src="(.*?)"', website_text)

for match in matches:
        if match[:7] != "http://":
                if match[0] == "/":
                        slash = ""
                else:
                        slash = "/"
                match_set.add(dir + slash + match)
        else:
                match_set.add(match)

match_set = list(match_set)
match_set.sort()

for item in match_set:
Example #30
def Netflix():
	service = sre.findall('friendlyName":"([^"]+)', HTMLsource2)
	if 'Netflix Instant' in service:
		return True
	else:
		return False
Example #31
import urllib2
import sre
import sys

#implementation taken from
#http://www.techrepublic.com/article/parsing-data-from-the-web-in-python/


def title_from_url(streamurl):
    website = None
    streamtitle = None
    try:
        website = urllib2.urlopen(streamurl)
    except urllib2.HTTPError, e:
        print("Cannot retrieve URL: HTTP Error Code", e.code)
    except urllib2.URLError, e:
        print("Cannot retrieve URL: ", e.reason)
    if website:
        pagehtml = website.read()
        streamtitle = sre.findall(
            "<meta content='(.*?)' property='og:description'>", pagehtml)[0]
    return streamtitle


if __name__ == '__main__':
    if len(sys.argv) == 1:
        streamurl = 'http://twitch.tv/morrow'  #test string
    else:
        streamurl = sys.argv[1]
    print(title_from_url(streamurl))
Example #32
		data={"from": mailfrom+" <"+mailfromemail+">", 
		"to": [mailto], 
		"subject": "New apartment ad", 
		"text": "New ad: "+url})) :
		status = True;

	return status

match_set = set()

# Get listing html source
website_handle = retrieveWebPage(listing)
website_text = website_handle.read()

# Find matches from listing page.
matches = sre.findall('class="item_link xiti_ad_heading" .*href="(.*?)"', 
	website_text)

# Add matches to match_set
for match in matches:
	match_set.add(match)

match_set = list(match_set)

# Establish db connection
connection = MongoClient(mongocon)

# Use ads database and blocket collection
db = connection.ads.blocket

# Iterate urls
for item in match_set:
Example #33
 def Repeats(s, type):
     return len(sre.findall(type, s.data))