Exemplo n.º 1
0
def processWallpaper(url):
	"""Download one wallpaper: resolve the original image URL, build a
	tag-prefixed file name, and save it under the page's category path.

	Relies on module-level helpers/globals defined elsewhere in this
	script: getSoup, fileDl, ensureDir, baseDir, update and stop.
	"""
	wallpaperSoup = getSoup(url)
	# The download button carries the direct image URL in data-href.
	wallpaperOriginalUrl = wallpaperSoup.find('span', { "class" : "btn btn-success download-button" })['data-href']
	sys.stdout.write("\t\tOriginal Wallpaper Url: " + wallpaperOriginalUrl + "\n\t\t\t")
	categories = wallpaperSoup.find('div', { "class" : "floatright" }).findAll('strong')
	name = wallpaperSoup.find('div', {'class': 'container center'}).find('div').text.strip().replace("/",".")
	tags = wallpaperSoup.findAll('div', {'style': 'padding:5px 10px; margin:1px; display:inline-block;'})
	# Collect, sort and wrap the tags as "[a][b]..." in one pass; this
	# replaces the manual index-tracked pre-sized list (whose len>0 guard
	# was redundant) and the quadratic string "+=" loop.
	taglist = "".join("[" + t + "]" for t in sorted(tag.text.strip() for tag in tags))
	# "<tags><name> - <id>.<ext>"; the separator degrades gracefully when
	# either the tag list or the page name is empty.
	fileName = taglist + name + ((" " if len(taglist) > 0 else "") if len(name) == 0 else " - ") + wallpaperOriginalUrl.split('/')[-4]  + "." + wallpaperOriginalUrl.split('/')[-2]
	directoryStructure = baseDir
	# Print the breadcrumb as "a => b => c" while building the directory.
	for i, category in enumerate(categories):
		categoryName = category.text.strip()
		sys.stdout.write(categoryName + ("" if i == (len(categories) - 1) else " => "))
		directoryStructure += categoryName + "/"
	sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName + "\n")
	ensureDir(directoryStructure)
	retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t", fileName)
	# NOTE(review): 42 is presumably fileDl's "file already exists" code,
	# used in update mode to stop once we reach old content -- confirm
	# against fileDl's definition.
	if int(retval) == 42 and update:
		global stop
		stop = True
def processWallpaper(url):
    """Fetch one wallpaper page, derive its save path from the category
    breadcrumb, and download the full-resolution image.

    Uses module-level helpers/globals defined elsewhere in this script:
    getSoup, fileDl, ensureDir, baseDir, update and stop.
    """
    soup = getSoup(url)
    # The download button's data-href points at the original image.
    originalUrl = soup.find(
        'span', {"class": "btn btn-success download-button"})['data-href']
    sys.stdout.write("\t\tOriginal Wallpaper Url: " + originalUrl + "\n\t\t\t")

    # File name is "<id>.<extension>", both read from the URL path.
    urlParts = originalUrl.split('/')
    fileName = urlParts[-4] + "." + urlParts[-2]

    # Walk the category breadcrumb, printing "a => b => c" and extending
    # the target directory one level per category.
    crumbs = [strong.text.strip() for strong in soup.find(
        'div', {"class": "floatright"}).findAll('strong')]
    directoryStructure = baseDir
    for i, crumb in enumerate(crumbs):
        separator = " => " if i < len(crumbs) - 1 else ""
        sys.stdout.write(crumb + separator)
        directoryStructure += crumb + "/"
    sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName +
                     "\n")

    ensureDir(directoryStructure)
    retval = fileDl(originalUrl, directoryStructure, "\t\t\t\t\t", fileName)
    # In update mode a 42 return value halts the whole crawl.
    if int(retval) == 42 and update:
        global stop
        stop = True
Exemplo n.º 3
0
baseDir = "/root/econtalk.org/"
baseUrl = "http://www.econtalk.org/"
archiveSoup = getSoup(baseUrl + "archives.html")

# All episodes live in a single date-based archive table; row 0 is the
# header row.
tableRows = archiveSoup.find(
    'div', {
        'class': 'archive-individual archive-date-based archive'
    }).findAll('tr')
# Skip the header by slicing instead of calling tableRows.index(tableRow)
# on every iteration: index() is O(n) per row, and it returns the FIRST
# matching row (bs4 tags compare by content), so any row equal to another
# earlier row would be misidentified and skipped.
for tableRow in tableRows[1:]:
    date = datetime.strptime(
        tableRow.find('td', {
            'width': '5%'
        }).text.strip(), "%Y/%m/%d")
    # A non-empty third cell marks an "Extra" (non-podcast) entry.
    extra = len(tableRow.findAll('td')[2].text.strip()) != 0
    name = tableRow.find('a').text
    dirName = date.strftime("%Y-%m-%d") + (" Extra " if extra else
                                           " ") + "- " + name + "/"
    url = tableRow.find('a')['href']
    ensureDir(baseDir + dirName)
    print(dirName[:-1])
    if not extra:
        # Regular episodes carry a dedicated "Download" link for the audio.
        podcastSoup = getSoup(url)
        url1 = podcastSoup.find('a', text="Download")['href']
        print("\t" + url1)
        fileDl(url1, baseDir + dirName, "\t\t")
    print("\t" + url)
    fileDl(url, baseDir + dirName, "\t\t")
Exemplo n.º 4
0
    # NOTE(review): fragment -- the enclosing "for page in ..." loop header
    # and the body of the bare "except:" at the end are outside this view.
    pageSoup = getSoup("http://thechive.com/page/" + str(page) + "/")
    print("Page " + str(page) + " of " + str(lastPage))
    for article in pageSoup.findAll('article', {"role": "article"}):
        date = article.find('time').text.strip()
        h3 = article.find('h3', {"class": "post-title entry-title card-title"})
        name = h3.text.strip()
        url = h3.find('a')['href']
        # "filter" presumably is a module-level list of wanted keywords
        # (it shadows the builtin) -- confirm against the full script.
        if any(x in name for x in filter):
            print("\tName: " + name + "\n\t\tDate: " + date)
            # Posts whose date string fails to parse land in NonParsable/.
            dateFolder = "NonParsable/"
            try:
                dateFolder = datetime.strptime(
                    date, '%b %d, %Y').strftime("%Y/%m/%d/")
            except ValueError:
                print("\t\tGoing to NonParsable folder")
            ensureDir(baseDir + dateFolder + name + "/")
            postSoup = getSoup(url)
            for countTag in postSoup.findAll('div', {"class": "count-tag"}):
                try:
                    img = countTag.parent.find('img')
                    imgSrc = img['src'].split('?')[0] + "?quality=100"
                    imgName = img['src'].split('?')[0].split('/')[-1]
                    # Animated GIFs keep the real source in data-gifsrc.
                    if any(x in img['class'] for x in {"gif-animate"}):
                        imgSrc = img['data-gifsrc'].split('?')[0]
                        imgName = img['data-gifsrc'].split('?')[0].split(
                            '/')[-1]
                    print("\t\t\tImage" + countTag.text + ": " + imgSrc)
                    fileDl(imgSrc, baseDir + dateFolder + name + "/",
                           "\t\t\t\t", imgName)
                except:
Exemplo n.º 5
0
            # NOTE(review): tail of findLastPage() -- the function header and
            # the branches that compute newIncr / lastPlusInc / newStatus are
            # outside this view.
            if getStatus(baseUrl + str(last + 1) + "/") == 404:
                return last
        newIncr = 1
    # Recurse with the updated increment until the last page is found.
    return findLastPage(newIncr, lastPlusInc, newStatus)


# Find the last pagination index, then scan every page from 0 through it.
print("Invoking findLastPage()")
lastPage = findLastPage()
print("Last page is " + str(lastPage))

for page in range(lastPage + 1):
    pageSoup = getSoup("http://thechive.com/page/" + str(page) + "/")
    print("Page " + str(page) + " of " + str(lastPage))
    for article in pageSoup.findAll('article', {"role": "article"}):
        date = article.find('time').text.strip()
        h3 = article.find('h3', {"class": "post-title entry-title card-title"})
        name = h3.text.strip()
        url = h3.find('a')['href']
        # Keep only posts whose title contains one of the wanted keywords.
        if any(x in name for x in filter):
            print("\tName: " + name + "\n\t\tDate: " + date)
            targetDir = baseDir + name + "/"
            ensureDir(targetDir)
            postSoup = getSoup(url)
            # Each count-tag div sits next to one downloadable image.
            for countTag in postSoup.findAll('div', {"class": "count-tag"}):
                rawSrc = countTag.parent.find('img')['src'].split('?')[0]
                imgSrc = rawSrc + "?quality=100"
                print("\t\t\tImage" + countTag.text + ": " + imgSrc)
                fileDl(imgSrc, targetDir, "\t\t\t\t", rawSrc.split('/')[-1])
Exemplo n.º 6
0
# Without a target directory (or with --help), print usage and bail out.
if (len(sys.argv) < 2) or sys.argv[1] == "--help":
    print(
        "Scraping script for dictionary.com/wordoftheday/ first WOTD: 1999/5/3"
    )
    print(
        "Usage:\n\tAll until today: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\""
    )
    print(
        "\tSpecific date: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\" yyyy/mm/dd -single"
    )
    print(
        "\tDate range: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\" yyyy/mm/dd yyyy/mm/dd"
    )
    sys.exit()

# Default range: first ever word-of-the-day through today.
startDate = date(1999, 5, 3)
endDate = date.today()
ensureDir(sys.argv[1])

if len(sys.argv) == 4 and sys.argv[3] == "-single":
    # Single-date mode: download just that day, then exit.
    startDate = datetime.strptime(sys.argv[2], "%Y/%m/%d")
    # Fixed: was the Python 2 statement "print startDate" -- a syntax
    # error under Python 3 and inconsistent with every other print()
    # call in this script.
    print(startDate)
    dlForDate(startDate)
    sys.exit()
elif len(sys.argv) == 4 and sys.argv[3] != "-single":
    # Explicit start and end dates.
    startDate = datetime.strptime(sys.argv[2], "%Y/%m/%d")
    endDate = datetime.strptime(sys.argv[3], "%Y/%m/%d")

for singleDate in daterange(startDate, endDate):
    dlForDate(singleDate)
Exemplo n.º 7
0
                          "content.php?show=videos&section=37&page=" +
                          sys.argv[1])
# NOTE(review): fragment -- the urllib2.Request(...) call the two lines
# above belong to starts before this view; username, password, baseUrl,
# baseDir, regex and the helper functions come from elsewhere.
# NOTE(review): base64.encodestring is Python-2-only (removed in 3.9);
# together with urllib2 below, this example targets Python 2.
base64string = base64.encodestring('%s:%s' % (username, password)).replace(
    '\n', '')
request.add_header("Authorization", "Basic %s" % base64string)
result = urllib2.urlopen(request)

rootSoup = BeautifulSoup(result, "lxml")

# Each video listing sits in its own dark-background table.
for table in rootSoup.findAll('table', {"bgcolor": "#1d1d1d"}):
    anchor = table.find('a', {"style": "color:#FF0000"})
    name = anchor.text.strip()
    videoUrl = baseUrl + anchor['href']
    description = table.findAll('tr')[1].find('div').text.strip()
    dirName = baseDir + "pantyhoseplaza.com/" + name.replace(":", "")
    ensureDir(dirName)
    print(name + "\n\t" + videoUrl)
    # The per-video page needs the same basic-auth header.
    requestVid = urllib2.Request(videoUrl)
    requestVid.add_header("Authorization", "Basic %s" % base64string)
    resultVid = urllib2.urlopen(requestVid)
    vidSoup = BeautifulSoup(resultVid, "lxml")
    imageUrl = baseUrl + vidSoup.find('img',
                                      {"style": "border-color:#990000"})['src']
    print("\tIMAGE: " + imageUrl)
    fileDlWithAuth(imageUrl, base64string, dirName + "/", "\t")
    # Persist the scraped metadata alongside the downloads.
    data = {'Name': name, 'Description': description}
    with open(dirName + '/data.json', 'w') as outfile:
        json.dump(data, outfile)
    # The real video link is in whichever div matches the regex.
    for vidDiv in vidSoup.findAll('div'):
        if regex.match(vidDiv.text.strip()):
            trueVideoUrl = baseUrl + vidDiv.find('a')['href']
        # NOTE(review): fragment -- the getopt loop ("for o, a in opts:")
        # and the first "if" branch this assignment belongs to are outside
        # this view; two different examples appear spliced together here.
        simple = True
    elif o in ("-h", "--help"):
        usage()
        sys.exit()
    elif o == "--from":
        startDoc = int(a)
    elif o == "--to":
        endDoc = int(a)
    elif o == "--save-dir":
        baseDir = a
    else:
        usage()
        # NOTE(review): assert is stripped under "python -O"; the
        # sys.exit() below is the effective terminator -- confirm intent.
        assert False, "unhandled option"
        sys.exit()

ensureDir(baseDir)
documentSoup = getSoup(baseUrl + "0")
documentTotal = int(
    documentSoup.find('div', {
        "id": "PageRange"
    }).text.split('of')[1].strip().replace(',', ''))
print str(documentTotal) + " documents to download. Let's get started!"
if endDoc == -1:
    endDoc = documentTotal

documentNum = 1 + roundDownTo(startDoc, 50)
print(
    "Scanning range from " + str(roundDownTo(startDoc, 50)) + " to " +
    str(documentTotal if roundUpTo(endDoc, 50) > documentTotal else roundUpTo(
        endDoc, 50)))
for i in range(