def on_status(self, status):
    global follow_list
    # Format time into 2011-02-23T16:42:40+0000 form, a la Facebook.
    # Twitter's format: Wed Mar 23 22:51:50 +0000 2011
    formattedTime = self.formatTime(status['created_at'])
    hashtags = []
    if len(status['entities']['hashtags']):
        for val in status['entities']['hashtags']:
            hashtags.append(val['text'].replace("'", "\\'"))
    hashtag = ','.join(hashtags)
    urls = []
    if len(status['entities']['urls']):
        for val in status['entities']['urls']:
            urls.append(val['url'].replace("'", "\\'"))
    url = ','.join(urls)
    text = status['text'].replace("'", "\\'")
    # A trailing backslash would escape the closing quote in the SQL below.
    if text and text[-1] == '\\':
        text = text + " "
    if str(status['user']['id']) in follow_list:
        file_put_contents(str(status['user']['screen_name']) + " posted something")
        infoModule.info.site['dblink'] = mysql_tools.db_connect()
        sql = u"INSERT INTO `peepbuzz`.`twitter_queue` SET " + \
              u"`status_id` = '" + str(status['id']) + \
              u"', `created` = '" + formattedTime + \
              u"', `promoter_id` = '" + str(status['user']['id']) + \
              u"', `promoter` = '" + status['user']['screen_name'] + \
              u"', `thumbnail` = '" + str(status['user']['profile_image_url']) + \
              u"', `summary` = '" + text + \
              u"', `external_id` = '" + str(status['user']['id']) + \
              u"', `hashtags` = '" + hashtag + \
              u"', `urls` = '" + url + "'"
        mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        infoModule.info.site['dblink'].close()
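# The INSERT above escapes quotes by hand. If mysql_tools.db_connect() returns
# (or wraps) a MySQLdb connection, parameter binding does the escaping instead.
# A minimal sketch under that assumption -- queue_status is a hypothetical
# helper, not part of this codebase:
import MySQLdb

def queue_status(db, status, formattedTime, hashtag, url):
    cursor = db.cursor()
    cursor.execute(
        "INSERT INTO peepbuzz.twitter_queue "
        "(status_id, created, promoter_id, promoter, thumbnail, summary, external_id, hashtags, urls) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
        (status['id'], formattedTime, status['user']['id'],
         status['user']['screen_name'], status['user']['profile_image_url'],
         status['text'], status['user']['id'], hashtag, url))
    db.commit()
    cursor.close()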
def cleanup(days):
    link = mysql_tools.db_connect()
    query = 'SELECT filament_id, story_id FROM peepbuzz.filaments WHERE created <= DATE_SUB(NOW(), INTERVAL ' + str(days) + ' DAY)'
    result = mysql_tools.mysqlQuery(query, link)
    while True:
        row = result.fetch_row(1, 1)
        if row == ():
            break
        # Capture both ids before the inner fetch below reuses any variables.
        filament_id = str(row[0]['filament_id'])
        story_id = row[0]['story_id']
        query = 'DELETE FROM peepbuzz.filaments WHERE filament_id = "' + filament_id + '"'
        try:
            mysql_tools.mysqlQuery(query, link)
        except:
            pprint.pprint(query)
            sys.exit(1)
        if story_id is not None:
            # If no filaments reference this story any more, delete the story too.
            query = 'SELECT count(*) AS cnt FROM peepbuzz.filaments WHERE story_id = "' + str(story_id) + '"'
            try:
                result2 = mysql_tools.mysqlQuery(query, link)
            except:
                pprint.pprint(query)
                sys.exit(1)
            countRow = result2.fetch_row(1, 1)
            if countRow == ():
                break
            # count(*) comes back as a string keyed by its alias; cast before comparing.
            if int(countRow[0]['cnt']) == 0:
                query = 'DELETE FROM peepbuzz.stories WHERE story_id = "' + str(story_id) + '"'
                try:
                    mysql_tools.mysqlQuery(query, link)
                except:
                    pprint.pprint(query)
                    sys.exit(1)
    return True
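# Sketch: the same cleanup as two set-based statements, assuming MySQL's
# multi-table DELETE syntax. Note the second statement deletes every orphaned
# story, not only those orphaned by this pass -- a slightly wider net than the
# loop above, so treat this as illustrative rather than a drop-in replacement.
def cleanup_sql(days, link):
    mysql_tools.mysqlQuery(
        "DELETE FROM peepbuzz.filaments "
        "WHERE created <= DATE_SUB(NOW(), INTERVAL " + str(int(days)) + " DAY)", link)
    mysql_tools.mysqlQuery(
        "DELETE s FROM peepbuzz.stories s "
        "LEFT JOIN peepbuzz.filaments f ON f.story_id = s.story_id "
        "WHERE f.filament_id IS NULL", link)
    return True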
def test_find_account(self):
    infoModule.info.site['dblink'] = mysql_tools.db_connect()
    dblink = infoModule.info.site['dblink']
    self.stream_id = 1
    self.external_id = 'acctfndrtest'
    self.user_name = 'account finder'
    self.thumbnail = 'http://newdisorder.com/image/trabant_cover.gif'
    # First call should create the account...
    res = accountFinder.accountFinder(self.stream_id, self.external_id, self.user_name, self.thumbnail)
    self.assertEqual(res[1], 'accounts')
    self.new_id = res[0]
    # ...and a second identical call should find the same row, not create another.
    res = accountFinder.accountFinder(self.stream_id, self.external_id, self.user_name, self.thumbnail)
    self.assertEqual(res[0], self.new_id)
def main():
    global follow_list
    infoModule.info.site['dblink'] = mysql_tools.db_connect()
    auth = BasicAuthHandler(twitterAccount, twitterPassword)
    stream = newStream(auth, StreamWatcherListener(), timeout=None)
    follow_list = getUserList()
    # Demo mode:
    # follow_list = getUserList(True)
    infoModule.info.site['dblink'].close()
    if len(follow_list) == 0:
        file_put_contents('Could not get a list of users to follow')
        sys.exit()
    stream.filter(follow=follow_list)
def setUp(self):
    infoModule.info.site['dblink'] = mysql_tools.db_connect()
    dblink = infoModule.info.site['dblink']
    # Set up accounts for the test: grab any valid user and account.
    sql = "SELECT user_id from peepbuzz.users limit 1"
    userQ = mysql_tools.mysqlQuery(sql, dblink)
    user = userQ.fetch_row(1, 1)
    sql = "SELECT account_id from peepbuzz.accounts limit 1"
    accountQ = mysql_tools.mysqlQuery(sql, dblink)
    account = accountQ.fetch_row(1, 1)
    self.account_id = account[0]['account_id']
    self.user_id = user[0]['user_id']
    # self.unknown_account_id is expected to be set before setUp runs; it is
    # not created here.
    sql = "insert into peepbuzz.blocked_accounts set user_id=" + self.user_id + ", unknown_account_id=" + self.unknown_account_id + ", account_id=" + self.account_id
    testQ = mysql_tools.mysqlQuery(sql, dblink)
    self.ba_id = dblink.insert_id()
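# The setUp above leaves a blocked_accounts row behind. A matching tearDown is
# not shown in the original; a sketch, assuming the table's primary key column
# is named blocked_account_id (hypothetical -- verify against the schema):
def tearDown(self):
    dblink = infoModule.info.site['dblink']
    sql = "delete from peepbuzz.blocked_accounts where blocked_account_id=" + str(self.ba_id)
    mysql_tools.mysqlQuery(sql, dblink)
    dblink.close()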
# check images
if len(be['images']) > 0:
    for img in be['images']:
        query = u'insert into peepbuzz.story_images (story_id, url, width, height) values ("' + str(story_id) + '","' + str(img['url']) + '","' + str(img['width']) + '","' + str(img['height']) + '")'
        print query
        try:
            mysql_tools.mysqlQuery(query, infoModule.info.site['dblink'])
        except:
            return False
# check videos
if len(be['videos']) > 0:
    for vid in be['videos']:
        # check for dupes
        sql = "SELECT video_id FROM peepbuzz.story_videos WHERE story_id=" + str(story_id) + " and url='" + str(vid['url']) + "'"
        video_dupe_check_q = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        if video_dupe_check_q.num_rows() == 0:
            embed_code = vid['embed_code'].replace("'", "\\'")
            query = u"insert into peepbuzz.story_videos (story_id, url, embed_code, width, height) values ('" + str(story_id) + "','" + str(vid['url']) + "','" + embed_code + "','" + str(vid['width']) + "','" + str(vid['height']) + "')"
            try:
                mysql_tools.mysqlQuery(query, infoModule.info.site['dblink'])
            except:
                infoModule.info.errorList.append("failed to add video")
                infoModule.info.errorList.append(query)
return True

if __name__ == "__main__":
    link = mysql_tools.db_connect()
    if link == False:
        print "no connection"
        sys.exit(0)
    infoModule.info.site['dblink'] = link
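# Sketch for the video dupe check above: with a UNIQUE KEY on (story_id, url)
# -- a schema assumption, not confirmed here -- MySQL's INSERT IGNORE collapses
# the SELECT-then-INSERT pair into one statement and avoids the race between
# them. insert_video_ignore is a hypothetical helper:
def insert_video_ignore(story_id, vid, embed_code, dblink):
    sql = (u"INSERT IGNORE INTO peepbuzz.story_videos "
           u"(story_id, url, embed_code, width, height) VALUES ('"
           + str(story_id) + "','" + str(vid['url']) + "','" + embed_code
           + "','" + str(vid['width']) + "','" + str(vid['height']) + "')")
    mysql_tools.mysqlQuery(sql, dblink)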
        MyThread(x).start()
    disasterTimeout = 120.0  # better be able to get these all in 2 minutes
    startTime = time.time()
    while True:
        if explodedCount == len(url_list):
            break
        # disaster timeout and crash
        if time.time() - startTime > disasterTimeout:
            print "explodeUrls timeout"
            break
        time.sleep(0.1)  # yield between polls instead of busy-spinning
    return res

if __name__ == "__main__":
    infoModule.info.site['dblink'] = mysql_tools.db_connect()
    infoModule.info.source['body_extractor_servers'] = [('192.168.0.100', '1348'), ('192.168.0.26', '1348')]
    URLList = ['http://bit.ly/hrRze1',
               'http://twitpic.com/43s13z',
               'http://twitpic.com/43ouap',
               'http://twitpic.com/43ouak',
               'http://fb.me/ETYTKELP',
               'http://fb.me/ESd8X8Bl',
               'http://bit.ly/gZAJzU',
               'http://bit.ly/eBIhUP',
               'http://bit.ly/eTzXuu',
               'http://bit.ly/glhMcB',
               'http://fb.me/QSucj2IG',
               'http://fb.me/VnX2tOZn',
               'http://ow.ly/i/8ury',
               'http://t.co/OwnQba6',
               'http://bit.ly/ibSNUF',
               'http://fb.me/IfoHBSoh',
               'http://ow.ly/i/8ssC',
               'http://fb.me/S7Fmh2Ro',
               'http://twitpic.com/42o42e',
               'http://bit.ly/e7YkQA',
               'http://fb.me/RVtX25f4',
               'http://img.ly/34lS',
               'http://img.ly/34lj',
               'http://fb.me/IjEMbpZF',
               'http://bit.ly/fytGPO',
               'http://tinyurl.com/4bew58q',
               'http://ow.ly/i/8qnV',
def main():
    sleep = 1 * 60  # seconds to sleep between checks
    pidPath = "/tmp/twitterUserStream.pid"
    streamPath = "twitterUserStream.py"
    pid = None
    userCount = None
    lastUserCount = None
    noCheck = False
    running = False
    while True:
        infoModule.info.site['dblink'] = mysql_tools.db_connect()
        # New pass, so shift the user counts.
        lastUserCount = userCount
        userCount = None
        # Check the file for a PID.
        try:
            file = open(pidPath)
            while True:
                line = file.readline()
                if not line:
                    break
                pid = line
            if pid:
                # Get the status of the PID; signal 0 only tests for existence.
                try:
                    os.kill(int(pid), 0)
                except OSError:
                    running = False
                else:
                    running = True
                print str(pid) + " - is running?: " + str(running)
            else:
                running = False
        except IOError:
            # We don't care if the file does not exist, since it will get
            # created the first time around, so treat the daemon as not running.
            running = False
            pid = 0
        # Get the count of how many users we are following.
        sql = "SELECT count(*) as `userCount` FROM `peepbuzz`.`curators` LEFT JOIN `peepbuzz`.`accounts` ON `accounts`.`account_id` = `curators`.`account_id` WHERE `accounts`.`external_id` IS NOT NULL AND `accounts`.`stream_id` = 1"
        countQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        try:
            userCountV = countQ.fetch_row(1, 1)
            userCount = userCountV[0]['userCount']
        except:
            print "Problem fetching the users from the database"
            sys.exit()
        # If the count changed since the last pass OR the PID is not running,
        # restart the daemon.
        if userCount != lastUserCount or not running:
            try:
                os.kill(int(pid), 9)
            except OSError:
                # Process is already dead.
                pass
            print "Starting the Daemon"
            os.system("python2.7 " + streamPath + " &")
        infoModule.info.site['dblink'].close()
        time.sleep(sleep)
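# The monitor above reads /tmp/twitterUserStream.pid but never writes it; the
# daemon itself presumably records its own PID at startup. A minimal sketch of
# that side (write_pid is a hypothetical helper, not confirmed in this repo):
import os

def write_pid(pidPath="/tmp/twitterUserStream.pid"):
    f = open(pidPath, "w")
    f.write(str(os.getpid()))
    f.close()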
import infoModule
infoModule.info.site['remoteLogging'] = False
from alogClient import *
import mysql_tools
import _mysql

dblink = mysql_tools.db_connect()
infoModule.info.site['log_priority_threshold'] = 100

# One-off migration: drop the unknown_account_id column from curators and fold
# unknown_accounts into accounts.
sql = "alter table peepbuzz.curators drop foreign key curator_unknown_account_id_constraint"
print sql
mysql_tools.mysqlQuery(sql, dblink)

sql = "alter table peepbuzz.curators drop unknown_account_id"
print sql
mysql_tools.mysqlQuery(sql, dblink)

sql = "select * from peepbuzz.unknown_accounts"
print sql
uaQ = mysql_tools.mysqlQuery(sql, dblink)
while True:
    ua = uaQ.fetch_row(1, 1)
    if ua == ():
        break
    # Escape the copied user_name; a name containing a quote would otherwise
    # break the statement.
    sql = "insert into peepbuzz.accounts set stream_id=" + str(ua[0]['stream_id']) + \
          ", account_user_name='" + _mysql.escape_string(ua[0]['user_name']) + \
          "', external_id='" + ua[0]['external_id'] + \
          "', thumbnail='" + ua[0]['thumbnail'] + "'"
    print sql
    mysql_tools.mysqlQuery(sql, dblink)
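# Sketch: the row-by-row copy above can also be a single INSERT ... SELECT,
# assuming the four columns used in this migration are the only ones needed:
sql = ("insert into peepbuzz.accounts (stream_id, account_user_name, external_id, thumbnail) "
       "select stream_id, user_name, external_id, thumbnail from peepbuzz.unknown_accounts")
# mysql_tools.mysqlQuery(sql, dblink)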
def findImages(fullArticle, url):
    ## findImages finds all images in the page and returns them in a dict. This is different from the
    ## sourceReader version of findImages, which only returns the largest image.
    URLParts = urlparse.urlparse(url)
    myHost = URLParts[1]
    log.plog('image search: full article len=' + str(len(fullArticle)), 3)
    imageSearch = fullArticle
    overrideImageMinSize = False
    minWidth = 100
    minHeight = 100
    largestImage = ""
    largestWidth = 0
    largestHeight = 0
    maxSize = 0
    imageURLList = []
    imgSize = [0, 0]
    # Turn plain <a href="...jpg"> links into <img> tags so they are picked up
    # below. NOTE: re.sub's fourth positional argument is count, not flags, so
    # passing re.I | re.M positionally silently capped substitutions at 10.
    imageSearch = re.sub('<a.*?href=[\'"]([^\'"]*?\.jpg)[\'"].*?>', '<img src="\\1">',
                         imageSearch, flags=re.I | re.M)
    imageBucket = re.findall('(<img.*?src=[\'"][^"].*?\.jpg[\'"].*?>)', imageSearch, re.I)
    images = []
    # Get the images of the most recent story from this host; anything repeated
    # from story to story is site furniture and gets blacklisted.
    link = mysql_tools.db_connect()
    query = 'select story_id from peepbuzz.stories where url like "%' + str(myHost) + '%" order by story_id DESC LIMIT 1'
    result = mysql_tools.mysqlQuery(query, link)
    previous_images = []
    while True:
        row = result.fetch_row(1, 1)
        if row == ():
            break
        story_id = str(row[0]['story_id'])
        query = 'select url from peepbuzz.story_images where story_id="' + story_id + '"'
        result2 = mysql_tools.mysqlQuery(query, link)
        while True:
            imgRow = result2.fetch_row(1, 1)
            if imgRow == ():
                break
            previous_images.append(imgRow[0]['url'])
    # Get the list of blacklisted images for this domain.
    query = 'select full_path from peepbuzz.blacklisted_images where host like "%' + str(myHost) + '%"'
    result = mysql_tools.mysqlQuery(query, link)
    blacklisted_images = []
    if result:
        while True:
            row = result.fetch_row(1, 1)
            if row == ():
                break
            blacklisted_images.append(row[0]['full_path'])
    pprint.pprint(previous_images)
    pprint.pprint(blacklisted_images)
    for image in imageBucket:
        log.plog("image: " + image)
        match = re.search('[\s]*src=["\'](.*?)["\']', image)
        if match == None:
            continue
        imgUrl = match.group(1)
        if imgUrl[0:4] != "http":
            if imgUrl[0:1] == '/':
                imgUrl = 'http://' + myHost + imgUrl
            else:
                imgUrl = 'http://' + myHost + '/' + imgUrl
            log.plog("adding site_url to image to get " + imgUrl)
        # Check against the last story's images; on a match, blacklist it.
        if imgUrl in previous_images:
            query = 'insert into peepbuzz.blacklisted_images set host = "' + str(myHost) + '", full_path = "' + str(imgUrl) + '"'
            mysql_tools.mysqlQuery(query, link)
            blacklisted_images.append(imgUrl)
        if (imgUrl not in imageURLList) and (imgUrl not in blacklisted_images) and (imgUrl[-3:] == 'jpg' or imgUrl[-4:] == 'jpeg'):
            imageURLList.append(imgUrl)
            image = re.sub('onclick=\".*?\"', "", image)
            decWidth = re.search('width=["\']*(\d+)', image, re.I)
            decHeight = re.search('height=["\']*(\d+)', image, re.I)
            decWidth2 = re.search('width:\s*(\d+)px', image, re.I)
            # The original searched for width here too -- a copy-paste bug.
            decHeight2 = re.search('height:\s*(\d+)px', image, re.I)
            # group(1) is the captured number; the original compared group(0),
            # the whole match string, against the integer minimums.
            if decWidth != None and decHeight != None and int(decWidth.group(1)) > minWidth and int(decHeight.group(1)) > minHeight:
                imgSize = [int(decWidth.group(1)), int(decHeight.group(1))]
                err = "declared image size: " + str(imgSize[0]) + " x " + str(imgSize[1])
                log.plog(err)
            elif decWidth2 != None and decHeight2 != None and int(decWidth2.group(1)) > minWidth and int(decHeight2.group(1)) > minHeight:
                imgSize = [int(decWidth2.group(1)), int(decHeight2.group(1))]
                err = "declared (via style) image size: " + str(imgSize[0]) + " x " + str(imgSize[1])
                log.plog(err)
                imageErrorLog(err + " Image: " + imgUrl)
            else:
                # No usable declared size: fetch the file for its real width and height.
                imgSize = getimagesize(imgUrl)
                if imgSize[0] == 0 or imgSize[1] == 0 or (imgSize[0] < minWidth and imgSize[1] < minHeight):
                    continue
                err = "fetched image size via getimagesize(" + imgUrl + ") and got: " + str(imgSize[0]) + " x " + str(imgSize[1])
                log.plog(err, 1)
            images.append({"url": imgUrl, "width": imgSize[0], "height": imgSize[1]})
            # Track the largest image, skipping extreme aspect ratios (banners,
            # skyscrapers); float() avoids Python 2 integer division.
            if imgSize[0] * imgSize[1] > maxSize and float(imgSize[0]) / imgSize[1] < 3.5 and float(imgSize[1]) / imgSize[0] < 3.5:
                maxSize = imgSize[0] * imgSize[1]
                largestImage = imgUrl
                largestWidth = imgSize[0]
                largestHeight = imgSize[1]
            err = "Image: " + imgUrl + " - Size: " + str(imgSize[0]) + " x " + str(imgSize[1])
            log.plog(err)
    if largestWidth == 0:
        return None
    else:
        log.plog("findImages found " + str(len(images)) + " images", 3)
        return images
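# getimagesize is defined elsewhere in this codebase. For reference, a minimal
# stand-in under the assumption that it returns [width, height], with [0, 0]
# on failure; the PIL/urllib2 approach here is illustrative, not the repo's
# actual implementation:
import urllib2
from StringIO import StringIO
from PIL import Image

def getimagesize(imgUrl):
    try:
        data = urllib2.urlopen(imgUrl, timeout=10).read()
        im = Image.open(StringIO(data))
        return [im.size[0], im.size[1]]
    except Exception:
        return [0, 0]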
    )
    mysql_tools.mysqlQuery(sql, dblink)

def testUTF():
    dblink = infoModule.info.site["dblink"]
    sql = u"insert into peepbuzz.stories set title='foo faa \u2026 fum'"
    # mysql_tools.mysqlQuery(sql, dblink)
    sql = "select title from peepbuzz.stories"
    accountsQ = mysql_tools.mysqlQuery(sql, dblink)
    accounts = accountsQ.fetch_row(0, 1)
    print accounts[0]["title"]

if __name__ == "__main__":
    infoModule.info.site["dblink"] = mysql_tools.db_connect()
    infoModule.info.site["log_priority_threshold"] = 100
    # streamList is the accumulated JSON from all streams, declared global
    # because of the multi-threaded call to fetch streams.
    streamList = []
    # totalStreamsRead is incremented to decide when to exit the stream-threading
    # holding pattern.
    totalStreamsRead = 0
    # Map stream name to id, a common need.
    # Use: infoModule.info.site['stream_name_to_id']['twitter'] produces the id.
    infoModule.info.site["stream_name_to_id"] = loadStreams.loadStreams()
    # infoModule.info.source['body_extractor_host'] = "68.68.109.26"
    # infoModule.info.source['body_extractor_port'] = "1348"
    # determine the body extractor servers
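# For the u'\u2026' ellipsis in testUTF to round-trip, the connection has to
# speak utf8. Assuming mysql_tools wraps MySQLdb, that is typically set at
# connect time; connect_utf8 is a hypothetical helper sketching the assumption,
# not this codebase's actual call:
import MySQLdb

def connect_utf8(host, user, passwd):
    # charset='utf8' + use_unicode=True make MySQLdb return unicode and encode
    # unicode literals correctly on the wire.
    return MySQLdb.connect(host=host, user=user, passwd=passwd,
                           charset='utf8', use_unicode=True)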