def getSearchImgs(query, limit=None):
    '''Gets the src of images found on tumblr posts tagged with query.

    query -- tag to look up; spaces are replaced by %20 before the tag
             is appended to the tumblr "tagged" url
    limit -- maximum number of posts to visit; None, or a value larger
             than the number of posts found, means every post

    Returns a single flat list. For each visited post, in order, it
    holds the srcs found on the post page followed by the srcs found in
    each of its photoset iframes, all passed through cleanImgSrcs.
    Progress is printed as the posts are visited.
    '''
    tagUrl = 'http://www.tumblr.com/tagged/' + query.replace(' ', '%20')
    postUrls = [a.get('href') for a in getElementsFromUrl(tagUrl, 'a.go')]
    if limit is None or limit > len(postUrls):
        limit = len(postUrls)

    imageSrcs = []
    for position, postUrl in enumerate(postUrls[:limit], 1):
        # One pre-formatted argument per print call: the text is the
        # same space separated line a multi-item print would produce.
        print('%s %s %s %s' % (position, '/', limit, postUrl))

        # Pictures sitting directly on the post page
        directSrcs = cleanImgSrcs(getImgSrcs(postUrl))
        print('%s %s' % ('\tFound :', len(directSrcs)))
        imageSrcs.extend(directSrcs)

        # Pictures sitting inside the photoset iframes of the post
        frames = getElementsFromUrl(postUrl, 'iframe.photoset')
        frameUrls = [frame.get('src') for frame in frames]
        for frameUrl in frameUrls:
            print('%s %s' % ('\tiframe:', frameUrl))
            frameSrcs = cleanImgSrcs(getImgSrcs(frameUrl))
            print('%s %s' % ('\tFound :', len(frameSrcs)))
            imageSrcs.extend(frameSrcs)
    return imageSrcs
def getImgurImageSrcs(href):
    '''Returns a list of the src parameter of the image(s) from the page.

    href -- address of the page to inspect

    Every element matched by 'div#image img' contributes its src with
    everything from the first '?' onwards removed. Elements that have
    no src attribute are skipped.
    '''
    ret = []
    for e in getElementsFromUrl(href, 'div#image img'):
        src = e.get('src')
        if src is None:
            # Nothing to clean; re.sub would raise TypeError on None.
            continue
        # Raw string: '\?' is not a valid string escape, so the non-raw
        # form triggers an invalid-escape warning on recent Pythons.
        ret.append(re.sub(r'\?.*', '', src))  # remove trailing parameters
    return ret
def getAdmittedStudents(program, year):
    '''Returns the names of the students who joined the program in a
    given year.

    program -- program identifier; it is lower-cased to build the url
    year    -- first year of the cohort, as a number

    The group page address ends with the lower-cased program, the year,
    and the text of year + 1 without its first two characters
    (2012 gives '13'). The names are the right-stripped text of every
    'td:nth-child(2)' cell on that page.
    '''
    groupName = program.lower()
    followingYear = str(year + 1)[2:]
    href = 'http://eduportfolio.org/groupes/view/portfolio_{0}{1}{2}'.format(
        groupName, year, followingYear)
    return [cell.text.rstrip()
            for cell in getElementsFromUrl(href, 'td:nth-child(2)')]
def getImgurGalleryHrefTitle(galleryAddress):
    '''Returns a list of (page href, img title) tuples for a gallery.

    galleryAddress -- address of the gallery page; also used as the
                      base against which each link is resolved

    Each href points to the page containing the image(s), not to the
    actual image. The title is read from the <img> found inside the
    link.
    '''
    pairs = []
    for anchor in getElementsFromUrl(galleryAddress, 'div.post > a'):
        pageHref = urllib2.urlparse.urljoin(galleryAddress,
                                            anchor.get('href'))
        rawTitle = anchor.find('img').get('title')
        # Remove nbr of views (the <p>...</p> part of the title)
        pairs.append((pageHref, re.sub('<p>.+?</p>', '', rawTitle)))
    return pairs
def downloadImgur(href, path=''):
    '''Detects the type of url and does the appropriate download.

    href -- address to download from
    path -- destination handed over to the download helpers

    The checks are tried in this order:
      'gallery/' in href      -> downloadImgurPage
      '/r/' in href           -> downloadImgurGallery
      fourth char from the
      end is a '.'            -> downloadResource on href itself
      anything else           -> downloadResource on every link
                                 matched by 'div.image.textbox > a'
    '''
    if 'gallery/' in href:
        downloadImgurPage(href, path)
        return

    if '/r/' in href:
        downloadImgurGallery(href, path)
        return

    if href[-4] == '.':
        # possibly a pic ex .jpg, .png
        downloadResource(href, destPath=path)
        return

    for anchor in getElementsFromUrl(href, 'div.image.textbox > a'):
        downloadResource(anchor.get('href'), destPath=path)
def scrapOreily(indexUrl, outName):
    '''Generates an html page from the index located at indexUrl.

    indexUrl -- address of the index page; it is also the base against
                which the relative links of the index are resolved
    outName  -- name of the html file to write (overwritten if present)

    The file receives, in order: the module level `head`, the content
    of the index page, the content of every page linked from
    '#bodyContent ol a:nth-child(1)', then the closing body/html tags.
    Each relative link is printed as it is processed.
    '''
    # The index is addressed through the indexUrl parameter; the body
    # previously referred to a name `url` that this function never binds.
    links = scraptools.getElementsFromUrl(
        indexUrl, '#bodyContent ol a:nth-child(1)')

    # `with` closes the file even when fetching one of the pages fails.
    with open(outName, 'w') as f:
        f.write(head)
        f.write(getHTMLContent(indexUrl))
        for link in links:
            relativeLink = link.get('href')
            print(relativeLink)
            absoluteLink = urlparse.urljoin(indexUrl, relativeLink)
            f.write(getHTMLContent(absoluteLink))
        f.write('</body></html>')
def getImgSrcs(url):
    '''Returns the src attribute of all images on a page.

    url -- address of the page to inspect

    One entry is produced per element matched by 'img', in the order
    the elements are returned.
    '''
    srcs = []
    for img in getElementsFromUrl(url, 'img'):
        srcs.append(img.get('src'))
    return srcs
'''Shows all masculine first names from a french website'''
from scraptools import getElementsFromUrl, urlIterator

startUrl = 'http://www.quelprenom.com/prenom-garcon.php'
nextCssSelector = 'span.button-right'

# Walk the pages one after the other, starting at startUrl;
# nextCssSelector is handed to urlIterator together with it.
for pageUrl in urlIterator(startUrl, nextCssSelector):
    # Print every name listed on the current page
    for nameElement in getElementsFromUrl(pageUrl, '.prenom-lien'):
        print(nameElement.text)
def getHTMLContent(url):
    '''Get html code of main content of a url on Oreilly.

    url -- address of the page to read

    Returns the first element matched by '#content', serialized as
    pretty-printed html. Raises IndexError when nothing matches.
    '''
    mainContent = scraptools.getElementsFromUrl(url, '#content')[0]
    return etree.tostring(mainContent, method="html", pretty_print=True)