def processWallpaper(url):
    """Scrape one wallpaper page and download the original image.

    Builds a file name from the sorted tag list, the page title and two
    path components of the image URL, mirrors the category breadcrumb as a
    directory tree under ``baseDir``, and downloads the file.

    Side effects: writes progress to stdout, creates directories, and sets
    the module-level ``stop`` flag when ``fileDl`` returns 42 while running
    in update mode.
    """
    wallpaperSoup = getSoup(url)
    # The download button carries the direct image URL in its data-href.
    wallpaperOriginalUrl = wallpaperSoup.find(
        'span', {"class": "btn btn-success download-button"})['data-href']
    sys.stdout.write("\t\tOriginal Wallpaper Url: " + wallpaperOriginalUrl +
                     "\n\t\t\t")
    categories = wallpaperSoup.find('div', {
        "class": "floatright"
    }).findAll('strong')
    # "/" in the title would create bogus sub-directories, so replace it.
    name = wallpaperSoup.find('div', {
        'class': 'container center'
    }).find('div').text.strip().replace("/", ".")
    tags = wallpaperSoup.findAll(
        'div',
        {'style': 'padding:5px 10px; margin:1px; display:inline-block;'})
    # Sorted "[tag1][tag2]..." prefix; empty string when there are no tags.
    # (Replaces the original preallocated list + index counter + string
    # concatenation loop with a comprehension and a single join.)
    taglist = "".join(
        "[" + tagText + "]"
        for tagText in sorted(tag.text.strip() for tag in tags))
    # Separator: " - " when a title exists, a single space when only tags
    # exist, nothing when both are empty.
    separator = ((" " if len(taglist) > 0 else "")
                 if len(name) == 0 else " - ")
    # NOTE(review): [-4]/[-2] look like the image id and extension path
    # components of the image URL -- confirm against the site's URL layout.
    fileName = (taglist + name + separator +
                wallpaperOriginalUrl.split('/')[-4] + "." +
                wallpaperOriginalUrl.split('/')[-2])
    # Echo the category breadcrumb and mirror it as a directory tree.
    categoryNames = [category.text.strip() for category in categories]
    sys.stdout.write(" => ".join(categoryNames))
    directoryStructure = baseDir + "".join(c + "/" for c in categoryNames)
    sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName +
                     "\n")
    ensureDir(directoryStructure)
    retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t",
                    fileName)
    # NOTE(review): 42 appears to be fileDl's "already downloaded" code;
    # in update mode that means we caught up with old content -- confirm.
    if int(retval) == 42 and update:
        global stop
        stop = True
def processWallpaper(url):
    """Scrape one wallpaper page and download the original image.

    Simpler variant without tag/title handling: the file name comes only
    from two path components of the image URL; the category breadcrumb is
    mirrored as a directory tree under ``baseDir``.

    Side effects: writes progress to stdout, creates directories, and sets
    the module-level ``stop`` flag when ``fileDl`` returns 42 while running
    in update mode.
    """
    wallpaperSoup = getSoup(url)
    # The download button carries the direct image URL in its data-href.
    wallpaperOriginalUrl = wallpaperSoup.find(
        'span', {"class": "btn btn-success download-button"})['data-href']
    sys.stdout.write("\t\tOriginal Wallpaper Url: " + wallpaperOriginalUrl +
                     "\n\t\t\t")
    categories = wallpaperSoup.find('div', {
        "class": "floatright"
    }).findAll('strong')
    # NOTE(review): [-4]/[-2] look like the image id and extension path
    # components of the image URL -- confirm against the site's URL layout.
    fileName = wallpaperOriginalUrl.split(
        '/')[-4] + "." + wallpaperOriginalUrl.split('/')[-2]
    # Echo the category breadcrumb and mirror it as a directory tree.
    # (Replaces the original index-based range(len(...)) loop.)
    categoryNames = [category.text.strip() for category in categories]
    sys.stdout.write(" => ".join(categoryNames))
    directoryStructure = baseDir + "".join(c + "/" for c in categoryNames)
    sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName +
                     "\n")
    ensureDir(directoryStructure)
    retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t",
                    fileName)
    # NOTE(review): 42 appears to be fileDl's "already downloaded" code;
    # in update mode that means we caught up with old content -- confirm.
    if int(retval) == 42 and update:
        global stop
        stop = True
# Mirror the econtalk.org archive: one directory per episode
# ("YYYY-MM-DD [Extra ]- <title>/") containing the episode page and, for
# regular episodes, the downloadable audio file.
baseDir = "/root/econtalk.org/"
baseUrl = "http://www.econtalk.org/"
archiveSoup = getSoup(baseUrl + "archives.html")
tableRows = archiveSoup.find(
    'div', {
        'class': 'archive-individual archive-date-based archive'
    }).findAll('tr')
# Skip the header row via slicing instead of calling tableRows.index()
# on every iteration (which was O(n) per row and wrong for duplicates).
for tableRow in tableRows[1:]:
    date = datetime.strptime(
        tableRow.find('td', {
            'width': '5%'
        }).text.strip(), "%Y/%m/%d")
    # A non-empty third cell marks an "Extra" (non-podcast) entry.
    extra = len(tableRow.findAll('td')[2].text.strip()) != 0
    name = tableRow.find('a').text
    dirName = (date.strftime("%Y-%m-%d") + (" Extra " if extra else " ") +
               "- " + name + "/")
    url = tableRow.find('a')['href']
    ensureDir(baseDir + dirName)
    print(dirName[:-1])
    if not extra:
        # Regular episodes link to an audio file behind a "Download" anchor.
        podcastSoup = getSoup(url)
        url1 = podcastSoup.find('a', text="Download")['href']
        print("\t" + url1)
        fileDl(url1, baseDir + dirName, "\t\t")
    # Always archive the episode page itself.
    print("\t" + url)
    fileDl(url, baseDir + dirName, "\t\t")
# Scrape one thechive.com listing page: for each article whose title matches
# the module-level "filter" collection, parse the post date into a
# "YYYY/MM/DD/" folder (falling back to "NonParsable/" on a ValueError from
# strptime), then download every counted image from the post, preferring the
# 'data-gifsrc' URL for animated gifs.
# NOTE(review): this excerpt is truncated -- it ends with a dangling
# "except:" whose handler body is cut off, so the code below is not
# syntactically complete as shown; left byte-identical rather than guessed.
# NOTE(review): depends on page/lastPage/filter/baseDir and the helpers
# getSoup/ensureDir/fileDl defined elsewhere in the file -- confirm.
pageSoup = getSoup("http://thechive.com/page/" + str(page) + "/") print("Page " + str(page) + " of " + str(lastPage)) for article in pageSoup.findAll('article', {"role": "article"}): date = article.find('time').text.strip() h3 = article.find('h3', {"class": "post-title entry-title card-title"}) name = h3.text.strip() url = h3.find('a')['href'] if any(x in name for x in filter): print("\tName: " + name + "\n\t\tDate: " + date) dateFolder = "NonParsable/" try: dateFolder = datetime.strptime( date, '%b %d, %Y').strftime("%Y/%m/%d/") except ValueError: print("\t\tGoing to NonParsable folder") ensureDir(baseDir + dateFolder + name + "/") postSoup = getSoup(url) for countTag in postSoup.findAll('div', {"class": "count-tag"}): try: img = countTag.parent.find('img') imgSrc = img['src'].split('?')[0] + "?quality=100" imgName = img['src'].split('?')[0].split('/')[-1] if any(x in img['class'] for x in {"gif-animate"}): imgSrc = img['data-gifsrc'].split('?')[0] imgName = img['data-gifsrc'].split('?')[0].split( '/')[-1] print("\t\t\tImage" + countTag.text + ": " + imgSrc) fileDl(imgSrc, baseDir + dateFolder + name + "/", "\t\t\t\t", imgName) except:
# Crawl thechive.com: first probe for the last listing page (the fragment at
# the start is the tail of findLastPage -- it returns "last" once the next
# page yields a 404, otherwise recurses with a reset increment), then walk
# every page from 0..lastPage, downloading counted images from each article
# whose title matches the module-level "filter" collection.
# NOTE(review): this excerpt is truncated at the start -- the "def" header of
# findLastPage (and its parameters last/lastPlusInc/newStatus) is outside
# this view, so the code below is not syntactically complete as shown; left
# byte-identical rather than guessed.
# NOTE(review): depends on baseUrl/baseDir/filter and the helpers
# getStatus/getSoup/ensureDir/fileDl defined elsewhere -- confirm.
if getStatus(baseUrl + str(last + 1) + "/") == 404: return last newIncr = 1 return findLastPage(newIncr, lastPlusInc, newStatus) print("Invoking findLastPage()") lastPage = findLastPage() print("Last page is " + str(lastPage)) for page in range(0, lastPage + 1): pageSoup = getSoup("http://thechive.com/page/" + str(page) + "/") print("Page " + str(page) + " of " + str(lastPage)) for article in pageSoup.findAll('article', {"role": "article"}): date = article.find('time').text.strip() h3 = article.find('h3', {"class": "post-title entry-title card-title"}) name = h3.text.strip() url = h3.find('a')['href'] if any(x in name for x in filter): print("\tName: " + name + "\n\t\tDate: " + date) ensureDir(baseDir + name + "/") postSoup = getSoup(url) for countTag in postSoup.findAll('div', {"class": "count-tag"}): imgSrc = countTag.parent.find('img')['src'].split( '?')[0] + "?quality=100" print("\t\t\tImage" + countTag.text + ": " + imgSrc) fileDl( imgSrc, baseDir + name + "/", "\t\t\t\t", countTag.parent.find('img')['src'].split('?')[0].split('/') [-1])
# Command-line entry point for the dictionary.com word-of-the-day scraper.
#
# Usage (first WOTD is 1999/5/3):
#   no args / --help            -> print usage and exit
#   <dir>                       -> download every date from 1999/5/3 to today
#   <dir> yyyy/mm/dd -single    -> download a single date
#   <dir> yyyy/mm/dd yyyy/mm/dd -> download a date range
#
# Relies on ensureDir/dlForDate/daterange defined elsewhere in the file.
if (len(sys.argv) < 2) or sys.argv[1] == "--help":
    print(
        "Scraping script for dictionary.com/wordoftheday/ first WOTD: 1999/5/3"
    )
    print(
        "Usage:\n\tAll until today: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\""
    )
    print(
        "\tSpecific date: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\" yyyy/mm/dd -single"
    )
    print(
        "\tDate range: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\" yyyy/mm/dd yyyy/mm/dd"
    )
    sys.exit()
# Defaults cover the "all until today" invocation.
startDate = date(1999, 5, 3)
endDate = date.today()
ensureDir(sys.argv[1])
if len(sys.argv) == 4 and sys.argv[3] == "-single":
    startDate = datetime.strptime(sys.argv[2], "%Y/%m/%d")
    # Fixed: this was a Python 2 print statement ("print startDate"),
    # a syntax error under Python 3 and inconsistent with the rest of
    # the script, which uses the print() function throughout.
    print(startDate)
    dlForDate(startDate)
    sys.exit()
elif len(sys.argv) == 4 and sys.argv[3] != "-single":
    # Explicit date range overrides both defaults.
    startDate = datetime.strptime(sys.argv[2], "%Y/%m/%d")
    endDate = datetime.strptime(sys.argv[3], "%Y/%m/%d")
# NOTE(review): a 3-argument invocation silently falls through to the full
# default range -- confirm whether that is intended.
for singleDate in daterange(startDate, endDate):
    dlForDate(singleDate)
"content.php?show=videos§ion=37&page=" + sys.argv[1]) base64string = base64.encodestring('%s:%s' % (username, password)).replace( '\n', '') request.add_header("Authorization", "Basic %s" % base64string) result = urllib2.urlopen(request) rootSoup = BeautifulSoup(result, "lxml") for table in rootSoup.findAll('table', {"bgcolor": "#1d1d1d"}): anchor = table.find('a', {"style": "color:#FF0000"}) name = anchor.text.strip() videoUrl = baseUrl + anchor['href'] description = table.findAll('tr')[1].find('div').text.strip() dirName = baseDir + "pantyhoseplaza.com/" + name.replace(":", "") ensureDir(dirName) print(name + "\n\t" + videoUrl) requestVid = urllib2.Request(videoUrl) requestVid.add_header("Authorization", "Basic %s" % base64string) resultVid = urllib2.urlopen(requestVid) vidSoup = BeautifulSoup(resultVid, "lxml") imageUrl = baseUrl + vidSoup.find('img', {"style": "border-color:#990000"})['src'] print("\tIMAGE: " + imageUrl) fileDlWithAuth(imageUrl, base64string, dirName + "/", "\t") data = {'Name': name, 'Description': description} with open(dirName + '/data.json', 'w') as outfile: json.dump(data, outfile) for vidDiv in vidSoup.findAll('div'): if regex.match(vidDiv.text.strip()): trueVideoUrl = baseUrl + vidDiv.find('a')['href']
# Tail of a getopt-style option loop (--from/--to/--save-dir/-h) followed by
# the main driver: read the total document count from the "PageRange" div of
# page 0, clamp the requested range to it in 50-document pages, and iterate.
# NOTE(review): this excerpt is truncated at both ends -- the for/if header
# of the option loop is missing before "simple = True", and the final
# "for i in range(" is cut off mid-statement; left byte-identical rather
# than guessed.
# NOTE(review): "print str(documentTotal) + ..." is a Python 2 print
# statement (the rest uses print()); cannot be fixed safely inside this
# truncated block -- flagging for the file's owner.
# NOTE(review): depends on usage/baseUrl/getSoup/ensureDir/roundDownTo/
# roundUpTo and startDoc/endDoc/baseDir defaults defined elsewhere.
simple = True elif o in ("-h", "--help"): usage() sys.exit() elif o == "--from": startDoc = int(a) elif o == "--to": endDoc = int(a) elif o == "--save-dir": baseDir = a else: usage() assert False, "unhandled option" sys.exit() ensureDir(baseDir) documentSoup = getSoup(baseUrl + "0") documentTotal = int( documentSoup.find('div', { "id": "PageRange" }).text.split('of')[1].strip().replace(',', '')) print str(documentTotal) + " documents to download. Let's get started!" if endDoc == -1: endDoc = documentTotal documentNum = 1 + roundDownTo(startDoc, 50) print( "Scanning range from " + str(roundDownTo(startDoc, 50)) + " to " + str(documentTotal if roundUpTo(endDoc, 50) > documentTotal else roundUpTo( endDoc, 50))) for i in range(