Example #1
import re

from lxml import etree
from lxml.cssselect import CSSSelector


def printUnitIdInfo(unit_id):
    '''Prints all the info associated with the unit_id, if any.'''
    # getUrlContent is a helper from the surrounding module (not shown here).
    href = 'http://lenordelec.ca/modules/imdirectory/unit.php?unit_id=%d' % unit_id
    source = getUrlContent(href)
    html = etree.HTML(source)

    selTitle = CSSSelector('div.imdirectory_unit_container_info h1')
    title = selTitle(html)

    if title:  # The page has info only if the title element is present
        print('unit_ID\t{}'.format(unit_id))
        print('Nom locataire\t{}'.format(title[0].text))
        selInfos = CSSSelector('div.unit_info')
        selTitle = CSSSelector('div.title')
        selValue = CSSSelector('div.value')
        infos = selInfos(html)

        # Iterate over all data fields (title/value pairs)
        for info in infos:
            title = selTitle(info)[0]
            value = selValue(info)[0]
            try:
                # encoding="unicode" makes tostring() return str, not bytes,
                # so the re.sub() below works on text
                valueText = etree.tostring(value, method="text",
                                           with_tail=False, encoding="unicode")
                valueText = re.sub('[\t\r\n]', '', valueText).rstrip()
            except Exception as e:
                valueText = str(e)

            print('{}\t{}'.format(title.text, valueText))
        print()
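All three examples depend on a getUrlContent helper that is not shown here. Below is a minimal sketch of what it might look like, assuming it simply fetches the URL and returns the body as text, or None on failure; the real implementation in the surrounding module may differ.

from urllib.request import urlopen


def getUrlContent(url):
    '''Hypothetical sketch: fetch url and return the body as text, or None.'''
    try:
        with urlopen(url) as response:
            return response.read().decode('utf-8', errors='replace')
    except OSError:  # URLError is a subclass of OSError in Python 3
        return None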
Example #2
import json


def getPics(subject, destPath='', nbPages=1, startPage=0):
    '''Downloads images matching subject into the folder destPath.'''
    # getImgSearchUrl, getUrlContent and downloadRessource are helpers
    # defined in the surrounding module (not shown here).
    for pageNb in range(startPage, startPage + nbPages):
        print('Page', pageNb)
        url = getImgSearchUrl(subject, pageNb)
        response = getUrlContent(url)
        if response:
            jsonInfo = json.loads(response)
            data = jsonInfo.get('responseData') if jsonInfo else None

            if data:
                # Each result also carries 'titleNoFormatting',
                # 'contentNoFormatting', 'width' and 'height'
                for res in data.get('results', []):
                    print(res['url'])
                    downloadRessource(res['url'], destPath)
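A hypothetical call, with the helpers above in scope, that would fetch two result pages for the query 'fractal' into a local pics/ folder:

getPics('fractal', destPath='pics/', nbPages=2)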
Example #3
import scraptools  # project-local module providing getUrlContent


def get_whois_html(url):
    '''Returns the raw HTML of the whois.com lookup page for the given URL.'''
    base_url = 'http://www.whois.com/whois/'
    # Strip the scheme and leading 'www.' so only the bare domain is queried
    query = url.replace('http://www.', '').replace('https://www.', '')
    search_url = base_url + query
    html = scraptools.getUrlContent(search_url)
    return html
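A hypothetical usage, assuming scraptools.getUrlContent behaves like the sketch above; html is the raw markup of the whois.com result page, or None if the request failed:

html = get_whois_html('http://www.example.com')
if html:
    print(html[:200])  # first characters of the WHOIS page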