Example #1
def getSearchImgs(query, limit=None):
    '''Gets the src of images on tumblr tagged with query'''
    query = query.replace(' ', '%20')
    searchPage = 'http://www.tumblr.com/tagged/' + query
    
    elems = getElementsFromUrl(searchPage, 'a.go')
    
    postUrls = [e.get('href') for e in elems]
    if limit is None or limit > len(postUrls):
        limit = len(postUrls)
    #print postUrls
    imageSrcs = []
    for i, postUrl in enumerate(postUrls[:limit], 1):
        print i, '/', limit, postUrl
        
        #Find pictures directly on post
        newSrcs = cleanImgSrcs(getImgSrcs(postUrl))
        print '\tFound :', len(newSrcs)
        imageSrcs += newSrcs
        
        #Find pictures in post iframe
        elems = getElementsFromUrl(postUrl, 'iframe.photoset')
        iframeUrls = [e.get('src') for e in elems]
        for iframeUrl in iframeUrls:
            print '\tiframe:', iframeUrl
            iframeImageSrcs = cleanImgSrcs(getImgSrcs(iframeUrl))
            print '\tFound :', len(iframeImageSrcs)
            imageSrcs += iframeImageSrcs

    return imageSrcs
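Every example on this page relies on getElementsFromUrl from scraptools, which is not reproduced here. A minimal sketch of its assumed behaviour (fetching the page with urllib2 and applying a CSS selector with lxml); the real implementation may differ:

import urllib2
from lxml import html

def getElementsFromUrl(url, cssSelector):
    '''Assumed behaviour: download url and return the lxml elements matching cssSelector'''
    page = urllib2.urlopen(url).read()   # fetch the raw HTML
    tree = html.fromstring(page)         # parse it with lxml
    return tree.cssselect(cssSelector)   # requires the cssselect package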
Example #2
def getImgurImageSrcs(href):
    '''Returns a list of the src attributes of the image(s) on the page'''
    ret = []
    for e in getElementsFromUrl(href, 'div#image img'):
        src = e.get('src')
        src = re.sub(r'\?.*', '', src)  # strip the query string
        ret.append(src)
    return ret
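The re.sub call above strips the query string so that only the bare image URL is kept; for instance (hypothetical address):

>>> import re
>>> re.sub(r'\?.*', '', 'http://i.imgur.com/abc123.jpg?fb')
'http://i.imgur.com/abc123.jpg'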
Example #3
def getAdmittedStudents(program, year):
    '''Returns the names of the students who joined the program in a given year'''
    href = 'http://eduportfolio.org/groupes/view/portfolio_{0}{1}{2}'\
    .format(program.lower(), year, str(year + 1)[2:])

    ret = []
    for row in getElementsFromUrl(href, 'td:nth-child(2)'):
        name = row.text.rstrip()
        ret.append(name)
    return ret
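For illustration, with a hypothetical program code 'BIO' and the year 2013, the group address ends in portfolio_bio201314: the lower-cased program code, the starting year, and the last two digits of the following year.

>>> 'portfolio_{0}{1}{2}'.format('BIO'.lower(), 2013, str(2013 + 1)[2:])
'portfolio_bio201314'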
Example #4
def getImgurGalleryHrefTitle(galleryAddress):
    '''Returns tuples of page href and image title;
    the href points to the page containing the image(s), not the actual image'''
    
    ret = []
    for e in getElementsFromUrl(galleryAddress, 'div.post > a'):
        src = urllib2.urlparse.urljoin(galleryAddress, e.get('href'))
        imgTag = e.find('img')
        title = imgTag.get('title')
        title = re.sub('<p>.+?</p>', '', title)  # remove the number-of-views paragraph
        ret.append((src, title))
    return ret
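A typical use is to iterate over the returned pairs; the gallery address below is only a placeholder:

for href, title in getImgurGalleryHrefTitle('http://imgur.com/r/wallpapers'):
    print title, '->', href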
Example #5
def downloadImgur(href, path=''):
    '''Detects the type of url and does the appropriate download'''
    if 'gallery/' in href:
        downloadImgurPage(href, path)
    elif '/r/' in href:
        downloadImgurGallery(href, path)
    elif href[-4] == '.':  # likely a direct image file, e.g. .jpg or .png
        downloadResource(href, destPath=path)
    else:
        imgBox = getElementsFromUrl(href, 'div.image.textbox > a')
        for e in imgBox:
            src = e.get('href')
            downloadResource(src, destPath=path)
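The branches above dispatch on the shape of the address: a gallery page, a subreddit gallery, a direct image file, or a single-image page. Hypothetical examples of each kind:

downloadImgur('http://imgur.com/gallery/abc12', path='out/')   # gallery page
downloadImgur('http://imgur.com/r/earthporn', path='out/')     # subreddit gallery
downloadImgur('http://i.imgur.com/abc12.jpg', path='out/')     # direct image file
downloadImgur('http://imgur.com/abc12', path='out/')           # single-image page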
Example #6
def scrapOreily(indexUrl, outName):
    '''Generates an html page from the index located at indexUrl'''
    links = scraptools.getElementsFromUrl(indexUrl, '#bodyContent ol a:nth-child(1)')
    
    f = open(outName, 'w')
    
    f.write(head)  # 'head' is a module-level HTML header constant, not shown in this snippet
    
    f.write(getHTMLContent(indexUrl))
    
    for link in links:
        relativeLink = link.get('href')
        print relativeLink
        absoluteLink = urlparse.urljoin(indexUrl, relativeLink)
        
        f.write(getHTMLContent(absoluteLink))
        
    f.write('</body></html>')
    f.close()
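The function writes a module-level head constant that is not shown on this page; it presumably holds the opening HTML that the final '</body></html>' write matches. A minimal placeholder could be:

head = ('<!DOCTYPE html>\n'
        '<html><head><meta charset="utf-8">'
        '<title>Index</title></head><body>\n')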
Example #7
def getImgSrcs(url):
    '''returns the src attribute of all images on a page'''
    elems = getElementsFromUrl(url, 'img')
    imgSrcs = [e.get('src') for e in elems]
    return imgSrcs
Example #8
'''Shows all masculine first names from a French website'''

from scraptools import getElementsFromUrl, urlIterator

startUrl = 'http://www.quelprenom.com/prenom-garcon.php'
nextCssSelector = 'span.button-right'

#Iterate on all the pages successively
for href in urlIterator(startUrl, nextCssSelector):
    #Iterate on all the names of a page
    for nameTag in getElementsFromUrl(href, '.prenom-lien'):
        print nameTag.text
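urlIterator is another scraptools helper that is not shown here. A rough sketch of its assumed behaviour, yielding the start URL and then following the element matched by nextCssSelector until no further page is found:

import urlparse

def urlIterator(startUrl, nextCssSelector):
    '''Assumed behaviour: yield each page URL in turn, following the "next" link'''
    href = startUrl
    while True:
        yield href
        elems = getElementsFromUrl(href, nextCssSelector)
        if not elems:
            break
        # The selector may match a wrapper (e.g. a <span>), so fall back to a nested <a>
        nextHref = elems[0].get('href')
        if nextHref is None and elems[0].find('.//a') is not None:
            nextHref = elems[0].find('.//a').get('href')
        if not nextHref:
            break
        href = urlparse.urljoin(href, nextHref)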
Example #9
def getHTMLContent(url):
    '''Gets the HTML code of the main content of an O'Reilly page'''
    elems = scraptools.getElementsFromUrl(url, '#content')
    content = elems[0]
    return etree.tostring(content, pretty_print=True, method="html")