# Dependencies for this snippet; getUrlContent is a helper defined elsewhere in
# the module (a sketch of such a helper appears after get_whois_html below).
import re
from lxml import etree
from lxml.cssselect import CSSSelector


def printUnitIdInfo(unit_id):
    '''Prints all the info associated with the unit_id, if any'''
    href = 'http://lenordelec.ca/modules/imdirectory/unit.php?unit_id=%d' % unit_id
    source = getUrlContent(href)
    html = etree.HTML(source)
    selTitle = CSSSelector('div.imdirectory_unit_container_info h1')
    title = selTitle(html)
    if len(title):  # Check if the page has info
        print 'unit_ID\t{}'.format(unit_id)
        # 'Nom locataire' = tenant name (the target site is French-language)
        print 'Nom locataire\t{}'.format(title[0].text)
        selInfos = CSSSelector('div.unit_info')
        selTitle = CSSSelector('div.title')
        selValue = CSSSelector('div.value')
        infos = selInfos(html)
        # Iterate over all data fields (label/value pairs)
        for info in infos:
            title = selTitle(info)[0]
            value = selValue(info)[0]
            try:
                valueText = etree.tostring(value, method="text", with_tail=False)
                valueText = re.sub('[\t\r\n]', '', valueText).rstrip()
            except Exception as e:
                valueText = str(e)
            print '{}\t{}'.format(title.text, valueText)
        print  # blank line between units
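
# Usage sketch for printUnitIdInfo: walk a range of directory ids and print
# whatever each page exposes. This helper is hypothetical and the id range is
# purely illustrative; valid ids depend on the lenordelec.ca directory itself.
def dumpUnitRange(firstId, lastId):
    '''Hypothetical helper: print the info for every unit_id in [firstId, lastId].'''
    for uid in xrange(firstId, lastId + 1):
        printUnitIdInfo(uid)
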
# Dependencies: getImgSearchUrl, getUrlContent and downloadRessource are helpers
# defined elsewhere in the module.
import json


def getPics(subject, destPath='', nbPages=1, startPage=0):
    '''Downloads images matching subject into the folder destPath'''
    for pageNb in xrange(startPage, startPage + nbPages):
        print 'Page', pageNb
        url = getImgSearchUrl(subject, pageNb)
        response = getUrlContent(url)
        if response:
            jsonInfo = json.loads(response)
            if jsonInfo and jsonInfo['responseData']:
                results = jsonInfo['responseData']['results']
                for res in results:
                    # print res['titleNoFormatting']
                    # print res['contentNoFormatting']
                    # print res['width'], res['height']
                    print res['url']
                    downloadRessource(res['url'], destPath)
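
# The getImgSearchUrl helper is not shown above. Judging from the
# 'responseData'/'results' fields that getPics reads, it most likely targeted the
# old Google Image Search AJAX API (since retired by Google), so the sketch below
# is an assumption about its shape, not the original implementation.
import urllib


def getImgSearchUrl(subject, pageNb, resultsPerPage=8):
    '''Hypothetical: build a query URL for the deprecated Google Image Search AJAX API.'''
    params = urllib.urlencode({
        'v': '1.0',
        'q': subject,
        'rsz': resultsPerPage,              # the old API capped this at 8
        'start': pageNb * resultsPerPage,   # result offset used for pagination
    })
    return 'https://ajax.googleapis.com/ajax/services/search/images?' + params
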
import scraptools  # local helper module providing getUrlContent


def get_whois_html(url):
    '''Fetches the raw whois.com lookup page for the given url'''
    base_url = 'http://www.whois.com/whois/'
    # whois.com expects a bare domain, so strip the scheme and leading www.
    query = url.replace('http://www.', '').replace('https://www.', '')
    search_url = base_url + query
    html = scraptools.getUrlContent(search_url)
    return html
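
# None of the snippets above define getUrlContent (used bare in the first two and
# as scraptools.getUrlContent in the last). A minimal Python 2 sketch of such a
# helper is given below as an assumption; the real scraptools module may handle
# headers, retries, or encodings differently.
import urllib2


def getUrlContent(url, timeout=10):
    '''Hypothetical helper: return the body of url as a string, or None on error.'''
    try:
        return urllib2.urlopen(url, timeout=timeout).read()
    except urllib2.URLError as e:
        print 'Failed to fetch', url, '-', e
        return None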