Python getSoup示例，functions.getSoup Python示例

示例#1

0

显示文件

def processWallpaper(url):
    """Download the original image linked from a wallpaper detail page.

    The page at ``url`` is fetched with ``getSoup``.  The original image URL
    is read from the download button's ``data-href`` attribute.  The file
    name is assembled from the sorted tags (each wrapped in ``[]``), the
    page title (with ``/`` replaced by ``.``) and two path segments of the
    image URL.  The file is saved below ``baseDir`` in nested directories
    named after the page's categories.  Progress is written to stdout.

    When ``fileDl`` returns 42 and the module-level ``update`` flag is
    truthy, the module-level ``stop`` flag is set to ``True``.
    """
    global stop

    page = getSoup(url)
    wallpaperOriginalUrl = page.find('span', {"class": "btn btn-success download-button"})['data-href']
    sys.stdout.write("\t\tOriginal Wallpaper Url: " + wallpaperOriginalUrl + "\n\t\t\t")

    categoryNodes = page.find('div', {"class": "floatright"}).findAll('strong')
    title = page.find('div', {'class': 'container center'}).find('div').text.strip().replace("/", ".")
    tagNodes = page.findAll('div', {'style': 'padding:5px 10px; margin:1px; display:inline-block;'})

    # Tags go first in the file name, alphabetically, as "[a][b]...".
    sortedLabels = sorted(node.text.strip() for node in tagNodes)
    tagPrefix = "".join("[" + label + "]" for label in sortedLabels)

    # A titled wallpaper gets " - " before the URL-derived part; an untitled
    # one gets a single space only when there are tags to separate from it.
    if title:
        separator = " - "
    elif tagPrefix:
        separator = " "
    else:
        separator = ""

    urlParts = wallpaperOriginalUrl.split('/')
    fileName = tagPrefix + title + separator + urlParts[-4] + "." + urlParts[-2]

    # Echo the category trail ("A => B => C") while building the matching
    # directory path.
    directoryStructure = baseDir
    lastPosition = len(categoryNodes) - 1
    for position, node in enumerate(categoryNodes):
        label = node.text.strip()
        sys.stdout.write(label + (" => " if position != lastPosition else ""))
        directoryStructure += label + "/"

    sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName + "\n")
    ensureDir(directoryStructure)
    retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t", fileName)
    if int(retval) == 42 and update:
        stop = True

示例#2

0

显示文件

def dlForDate(singleDate):
    """Fetch the Word of the Day page for ``singleDate`` and download its image.

    The page URL is built from the date formatted as ``YYYY/MM/DD``.  The
    image URL is taken from the page's ``og:image`` meta tag and passed to
    ``fileDl`` together with ``sys.argv[1]``.
    """
    datePath = singleDate.strftime("%Y/%m/%d")
    print("Getting Word of the Day for: " + datePath)

    pageUrl = "http://www.dictionary.com/wordoftheday/" + datePath + "/"
    wordSoup = getSoup(pageUrl)

    imageMeta = wordSoup.find('meta', {"property": "og:image"})
    url = imageMeta['content']
    print("\tDownloading:" + url)
    fileDl(url, sys.argv[1], "\t\t")

示例#3

0

显示文件

文件： wall.alphacoders.com.py 项目： n8wachT/Scraping-Scripts

def processWallpaper(url):
    """Download the original image linked from a wallpaper detail page.

    The page at ``url`` is fetched with ``getSoup``.  The original image URL
    is read from the download button's ``data-href`` attribute, and the file
    name is built from two path segments of that URL.  The file is saved
    below ``baseDir`` in nested directories named after the page's
    categories.  Progress is written to stdout.

    When ``fileDl`` returns 42 and the module-level ``update`` flag is
    truthy, the module-level ``stop`` flag is set to ``True``.
    """
    global stop

    page = getSoup(url)
    downloadButton = page.find('span', {"class": "btn btn-success download-button"})
    wallpaperOriginalUrl = downloadButton['data-href']
    sys.stdout.write("\t\tOriginal Wallpaper Url: " + wallpaperOriginalUrl + "\n\t\t\t")

    categoryNodes = page.find('div', {"class": "floatright"}).findAll('strong')

    urlParts = wallpaperOriginalUrl.split('/')
    fileName = urlParts[-4] + "." + urlParts[-2]

    # Echo the category trail ("A => B => C") while building the matching
    # directory path.
    directoryStructure = baseDir
    lastPosition = len(categoryNodes) - 1
    for position, node in enumerate(categoryNodes):
        label = node.text.strip()
        sys.stdout.write(label + (" => " if position != lastPosition else ""))
        directoryStructure += label + "/"

    sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName + "\n")
    ensureDir(directoryStructure)
    retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t", fileName)
    if int(retval) == 42 and update:
        stop = True

示例#4

0

显示文件

    opts, args = getopt.getopt(sys.argv[1:], 'hi:')
except getopt.GetoptError:
    usage()
    sys.exit(2)
# Walk the parsed command-line options: -h prints usage and exits, -i names
# the image file to look up, anything else is rejected with exit status 2.
for opt, arg in opts:
    if opt == '-h':
        usage()
        sys.exit()
    elif opt == '-i':
        imageFile = arg
    else:
        print("Unsupported option and/or argument")
        sys.exit(2)

# NOTE(review): imageFile is only assigned in the -i branch above; unless it
# is initialised earlier in the file, running without -i raises NameError here.
print("Input file is: " + imageFile)
# The image is opened in binary mode and handed to getSoup under the 'file'
# key -- presumably as an upload payload; confirm against getSoup's signature.
# NOTE(review): the file handle opened here is never explicitly closed.
iqdbSoup = getSoup("http://iqdb.org/", {}, {'file': open(imageFile, 'rb')})
#print(iqdbSoup.find('div', {'class': 'pages'}).prettify())
# Each <table> inside the "pages" div is handled as one result.
for result in iqdbSoup.find('div', {'class': 'pages'}).findAll('table'):
    # The first header cell of the first row holds the table's title.
    t1 = result.findAll('tr')[0].findAll('th')[0].text
    # Skip the table titled "Your image"; only the other tables are reported.
    if t1 != "Your image":
        #print(result.prettify())
        print("Image Info:")
        print("\t" + t1)
        t2 = result.find('td', {'class': 'image'}).find('a')['href']
        # Protocol-relative links ("//host/...") are given an explicit scheme.
        if t2[:2] == "//":
            t2 = "http:" + t2
        print("\t\tSource:\t\t" + t2)
        # The node right after the service icon is printed as the source page;
        # the concatenation below assumes it is text -- TODO confirm.
        t3 = result.find('img', {'class': 'service-icon'}).nextSibling
        print("\t\tSource Page:\t" + t3)
        # The fourth row's cell is split on spaces; its first token is split
        # on '×' and the part before it is parsed as the width.
        whs = result.findAll('tr')[3].find('td').text.split(' ')
        width = int(whs[0].split('×')[0])

示例#5

0

显示文件

#!/usr/bin/env python
import sys
from bs4 import BeautifulSoup
from functions import getSoup, fileDl, ensureDir
from datetime import datetime

# Local directory that per-episode folders are created under, and the site root.
baseDir = "/root/econtalk.org/"
baseUrl = "http://www.econtalk.org/"
# Fetch the archive index page listing the episodes.
archiveSoup = getSoup(baseUrl + "archives.html")

# Every <tr> inside the archive container div.
tableRows = archiveSoup.find(
    'div', {
        'class': 'archive-individual archive-date-based archive'
    }).findAll('tr')
for tableRow in tableRows:
    # Skip the first row (presumably the table header -- TODO confirm).
    # NOTE(review): list.index() rescans the list on every iteration and
    # matches by equality, so a later row equal to the first is skipped too.
    if tableRows.index(tableRow) == 0:
        continue
    # The cell with width="5%" carries the date as YYYY/MM/DD.
    date = datetime.strptime(
        tableRow.find('td', {
            'width': '5%'
        }).text.strip(), "%Y/%m/%d")
    # A non-empty third cell marks the row as an "Extra".
    extra = len(tableRow.findAll('td')[2].text.strip()) != 0
    name = tableRow.find('a').text
    # Directory name: "YYYY-MM-DD - <name>/" or "YYYY-MM-DD Extra - <name>/".
    dirName = date.strftime("%Y-%m-%d") + (" Extra " if extra else
                                           " ") + "- " + name + "/"
    url = tableRow.find('a')['href']
    ensureDir(baseDir + dirName)
    # Print the directory name without its trailing slash.
    print(dirName[:-1])
    # Only non-Extra rows have their page fetched to look up the link whose
    # text is "Download".
    if not extra:
        podcastSoup = getSoup(url)
        url1 = podcastSoup.find('a', text="Download")['href']

示例#6

0

显示文件

文件： thechive.com.py 项目： sak3r/Scraping-Scripts

          " next status should be " + str(shouldBe) +
          ", in/decrement will be " + str(newIncr))
    if increment == 1:
        if lastStatus == 200:
            if getStatus(baseUrl + str(last + 1) + "/") == 404:
                return last
        newIncr = 1
    return findLastPage(newIncr, lastPlusInc, newStatus)


# Determine the highest page number before crawling.
print("Invoking findLastPage()")
lastPage = findLastPage()
print("Last page is " + str(lastPage))

# Visit every page from 0 through lastPage inclusive.
for page in range(0, lastPage + 1):
    pageSoup = getSoup("http://thechive.com/page/" + str(page) + "/")
    print("Page " + str(page) + " of " + str(lastPage))
    # Each <article role="article"> is treated as one post.
    for article in pageSoup.findAll('article', {"role": "article"}):
        date = article.find('time').text.strip()
        h3 = article.find('h3', {"class": "post-title entry-title card-title"})
        name = h3.text.strip()
        url = h3.find('a')['href']
        # Keep only posts whose title contains one of the entries in `filter`.
        # NOTE(review): `filter` is presumably a module-level collection of
        # substrings that shadows the builtin -- confirm where it is defined.
        if any(x in name for x in filter):
            print("\tName: " + name + "\n\t\tDate: " + date)
            # Fall back to a catch-all folder when the date does not match
            # the "Mon DD, YYYY" format.
            dateFolder = "NonParsable/"
            try:
                dateFolder = datetime.strptime(
                    date, '%b %d, %Y').strftime("%Y/%m/%d/")
            except ValueError:
                print("\t\tGoing to NonParsable folder")
            # NOTE(review): the post title is used as a directory name as-is.
            ensureDir(baseDir + dateFolder + name + "/")

示例#7

0

显示文件

	index = 0
	if len(tags) > 0:
		for tag in tags:
			tagArray[index] = tag.text.strip()
			index += 1
	tagArray.sort()
	for tag in tagArray:
		taglist += "[" + tag + "]"
	fileName = taglist + name + ((" " if len(taglist) > 0 else "") if len(name) == 0 else " - ") + wallpaperOriginalUrl.split('/')[-4]  + "." + wallpaperOriginalUrl.split('/')[-2]
	directoryStructure = baseDir
	for i in range(0, len(categories)):
		sys.stdout.write(categories[i].text.strip() + ("" if i == (len(categories) - 1) else " => "))
		directoryStructure += categories[i].text.strip() + "/"
	sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName + "\n")
	ensureDir(directoryStructure)
	retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t", fileName)
	if int(retval) == 42 and update:
		global stop
		stop = True

# Read the page count from the last pagination link on page 0: the value
# after '=' in that link's href.
wallSoup = getSoup(baseUrl + "0")
paginationList = wallSoup.find('ul', {"class": "pagination pagination"})
lastPageHref = paginationList.findAll('li')[-1].find('a')['href']
totalPages = int(lastPageHref.split('=')[1])

# Visit pages 0 through totalPages inclusive and process every thumbnail.
for i in range(totalPages + 1):
	print("Scraping page " + str(i) + "...")
	listingSoup = getSoup(baseUrl + str(i))
	for thumbContainer in listingSoup.findAll('div', {"class": "thumb-container-big "}):
		wallpaperUrl = bUrl + thumbContainer.find('a')['href']
		print("\tbig.php url: " + wallpaperUrl)
		processWallpaper(wallpaperUrl)
		# processWallpaper sets the module-level stop flag; exit once it is set.
		if stop:
			sys.exit(420)

示例#8

0

显示文件

文件： thesandbornmaps.cudl.colorado.edu.py 项目： sak3r/Scraping-Scripts

    elif o in ("-h", "--help"):
        usage()
        sys.exit()
    elif o == "--from":
        startDoc = int(a)
    elif o == "--to":
        endDoc = int(a)
    elif o == "--save-dir":
        baseDir = a
    else:
        usage()
        assert False, "unhandled option"
        sys.exit()

# Make sure the save directory exists before anything is fetched.
ensureDir(baseDir)
# The total document count is the number after "of" in the PageRange
# element's text, with thousands separators removed.
documentSoup = getSoup(baseUrl + "0")
documentTotal = int(
    documentSoup.find('div', {
        "id": "PageRange"
    }).text.split('of')[1].strip().replace(',', ''))
# Use the call form of print: the bare print statement is a SyntaxError on
# Python 3, and the parenthesised form prints the same text on Python 2.
print(str(documentTotal) + " documents to download. Let's get started!")
# An endDoc of -1 means "through the last document"
# (presumably the default when --to is not given -- TODO confirm).
if endDoc == -1:
    endDoc = documentTotal

# The scan works on multiples of 50: start from the multiple at or below
# startDoc and report an upper bound capped at the total document count.
documentNum = 1 + roundDownTo(startDoc, 50)
print(
    "Scanning range from " + str(roundDownTo(startDoc, 50)) + " to " +
    str(documentTotal if roundUpTo(endDoc, 50) > documentTotal else roundUpTo(
        endDoc, 50)))
for i in range(
        roundDownTo(startDoc, 50),