import copy
import re
import urllib.parse

import requests
from lxml import html
from lxml import html as LH
# tostring alias used throughout this file for serializing elements.
from lxml.etree import tostring as htmlstring
from lxml.html import Element, fromstring

# Helpers referenced below but defined elsewhere in their own projects
# (css_select, log_influxdb, get_grades, to_ascii, stripslashes, hex2rgb,
# LogicNormal, logger, request, htmlmin, IrUiView, BOOKING_URL_PREFIX,
# ALLOCINE_URL_PREFIX) are assumed to be importable from those modules.


def fetchProvince(comlog_id):
    """Follow a lobbycanada.gc.ca communication log to the registrant's
    registration page and return the province code from the address block."""
    site = "https://lobbycanada.gc.ca"
    uri = "/app/secure/ocl/lrs/do/cmmLgPblcVw?comlogId=" + str(comlog_id)
    page = requests.get(site + uri)
    tree = html.fromstring(page.content)

    # The first table links to the registration detail page.
    table = tree.xpath('//table[@class="table"]')[0]
    uri = re.search(b'(?<=<a href=")(.*)(?=" target)',
                    htmlstring(table)).group(1).decode("utf-8")
    page = requests.get(site + uri)
    tree = html.fromstring(page.content)

    # Jump to the most recent registration, then to its "#indirect" section.
    uri = tree.get_element_by_id("regId").xpath('//option')[0].get("value")
    page = requests.get((site + uri).replace("#regStart", "") + "#indirect")
    tree = html.fromstring(page.content)

    # The province code is the token after the semicolon in the address block.
    addressHtml = tree.get_element_by_id("indirect").xpath(
        '//div[@class="col-sm-5"]')[0]
    province = re.search(rb'(?<=\n)[^\n]*(?=,)[^;]*;([A-Z][A-Z0-9]*)',
                         htmlstring(addressHtml)).group(1).decode("utf-8")
    return province
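
# fetchProvince chains three GET requests, two regex lookups over serialized
# markup, and two lookups by element id, any of which can raise if the
# registry pages change. A minimal defensive wrapper (a sketch;
# fetch_province_safe and its fallback value are not part of the original
# code):
def fetch_province_safe(comlog_id, default=""):
    try:
        return fetchProvince(comlog_id)
    except (IndexError, AttributeError, KeyError,
            requests.RequestException):
        # IndexError/AttributeError/KeyError cover a missing table, link or
        # element id; RequestException covers network failures.
        return default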
def parse_booking_hotel_page(url):
    '''
    Receive a URL corresponding to a hotel webpage, parse information about
    the hotel and return a tuple with this information.
    '''
    # Get the HTML; switch to the en-us locale to get prices in US dollars.
    try:
        content = requests.get(url.replace("en-gb", "en-us"),
                               headers={'Accept-Encoding': 'identity'}).text
        log_influxdb("HOTELS")
    except Exception:
        return ("#", float(0), float(0), float(0), '', [], '', '', {}, 'N/A')
    try:
        dom = LH.fromstring(content)

        # Latitude and longitude are embedded in an inline script.
        latitude = re.findall(
            r'booking\.env\.b_map_center_latitude = ([-.\d]+)', content)
        latitude = latitude[0] if len(latitude) > 0 else -1
        longitude = re.findall(
            r'booking\.env\.b_map_center_longitude = ([-.\d]+)', content)
        longitude = longitude[0] if len(longitude) > 0 else -1

        # Get the rate.
        tmp_rate = css_select(
            dom, 'span.average, span.js--hp-scorecard-scoreval, '
                 '[itemprop="ratingValue"]')
        rate = tmp_rate[0].text if len(tmp_rate) > 0 else -1

        # Get the address.
        address = css_select(dom, 'span.hp_address_subtitle')

        # Get image links.
        # pictures = css_select(dom, 'div#photos_distinct a')
        # pictures = [result.get('href').replace("max400", "max1024x768")
        #             for result in pictures]
        pictures = re.findall(
            r'https://.+\.bstatic\.com/images/hotel/max1024x768/.+\.jpg',
            content)
        pictures = list(set(pictures))

        # Get the price.
        price = re.findall(r'start at U*S*(.+?) .+', content)
        if len(price) >= 1:
            price = "From " + price[0]
        else:
            price = "N/A"

        # Get the description, stripped of its markup.
        description = css_select(dom, "div.hotel_description_wrapper_exp")
        description = htmlstring(description[0], encoding="unicode")
        description = re.sub("<[^>]*>", "", description)

        # Get the reviews URL, then the reviews themselves.
        reviews_url = css_select(dom, 'a.show_all_reviews_btn')
        reviews_url = [result.get('href') for result in reviews_url]
        reviews_url = BOOKING_URL_PREFIX + reviews_url[0]
        reviews = get_grades(reviews_url)

        if len(address) >= 1:
            return (url, float(latitude), float(longitude), float(rate),
                    address[0].text, pictures, description, reviews_url,
                    reviews, price)
        else:
            return (url, float(latitude), float(longitude), float(rate), '',
                    pictures, description, reviews_url, reviews, price)
    except Exception:
        return ("#", float(0), float(0), float(0), '', [], '', '',
                dict(hotel_clean=float(-4)), 'N/A')
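
# css_select above is a project helper that is not defined in this file.
# A minimal sketch matching the (dom, selector) call order used by
# parse_booking_hotel_page, assuming lxml's cssselect support is installed.
# Note that movie_details at the bottom of this file passes the arguments in
# the opposite order, so the two scrapers presumably live in separate modules
# with their own helpers:
def css_select(dom, selector):
    """Return the list of elements in `dom` matching a CSS selector."""
    return dom.cssselect(selector)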
def render_template(self, template, values=None, engine='ir.qweb'):
    # Override of Odoo's ir.ui.view render_template: post-processes the
    # rendered page according to the website's optimiser settings.
    res = super(IrUiView, self).render_template(template, values, engine)
    res_copy = res
    try:
        website = request.website
    except Exception:
        website = False
    optimiser = website and self.env['optimiser.optimiser'].sudo().search(
        [('website_id', '=', website.id)])
    try:
        res = res.decode("utf-8", "ignore").encode("ascii",
                                                   "xmlcharrefreplace")
    except Exception:
        pass
    if values and values.get('request', False) and optimiser:
        res = fromstring(res)
        head = res.find('.//head')
        body = res.find('.//body')
        no_head_body = head is None or body is None
        if not no_head_body:
            if not request.httprequest.is_xhr:
                if optimiser.load_css_async or optimiser.css_bottom:
                    styles = res.cssselect('link[rel="stylesheet"]')
                    ie_styles = ""
                    for style in styles:
                        # Collect self-closing <link/> markup for the IE
                        # fallback script below.
                        ie_styles += htmlstring(
                            style, method="html").decode("utf-8").strip(
                            ).strip("\n").rstrip('>') + "/>"
                        if optimiser.css_bottom:
                            body.insert(len(body), style)
                        if optimiser.load_css_async:
                            # Keep a <noscript> copy of the stylesheet, then
                            # turn the link into a preload that re-enables
                            # itself once loaded.
                            noscript_tag = Element('noscript')
                            tmp_style = copy.copy(style)
                            noscript_tag.insert(0, tmp_style)
                            parent = style.getparent()
                            parent.insert(parent.index(style) + 1,
                                          noscript_tag)
                            style.attrib['rel'] = 'preload'
                            style.attrib['as'] = 'style'
                            style.attrib['onload'] = \
                                "this.onload=null;this.rel='stylesheet'"
                    script_tag_for_converting_styles = Element("script")
                    script_tag_for_converting_styles.attrib[
                        'data-not-touchable'] = 'true'
                    script_tag_for_converting_styles.text = (
                        "function supportsToken(token){return function(relList)"
                        "{if(relList && relList.supports && token){return "
                        "relList.supports(token)} return false}}; "
                        "window.onload = function(){if(!supportsToken('preload')"
                        "(document.createElement('link').relList)){var links="
                        "document.querySelectorAll('link[as=\"style\"]"
                        "[rel=\"preload\"]'); if(links.length){for(var i in "
                        "links){links[i].rel='stylesheet'}}}}")
                    body.insert(len(body), script_tag_for_converting_styles)
                    script_tag_for_checking_ie = Element('script')
                    script_tag_for_checking_ie.attrib[
                        'data-not-touchable'] = 'true'
                    script_tag_for_checking_ie.text = (
                        "function isIE(){var myNav=navigator.userAgent."
                        "toLowerCase(); return (myNav.indexOf('msie') != -1 "
                        "|| myNav.indexOf('trident') != -1) ? true : false;}; "
                        "if(isIE()){var div=document.createElement('div');"
                        "div.innerHTML='%s';document.head.appendChild(div);}"
                        % ie_styles)
                    head.insert(len(head), script_tag_for_checking_ie)
                if optimiser.js_bottom:
                    scripts = res.cssselect('script:not([data-not-touchable])')
                    for script in scripts:
                        body.insert(len(body), script)
                if optimiser.load_js_async:
                    scripts = res.cssselect('script[src]')
                    lazy_scripts = res.cssselect('script[data-src]')
                    optimiser_js_async_setting = optimiser.load_js_async
                    # The payment page only tolerates plain deferring.
                    if "shop/payment" in request.httprequest.path:
                        optimiser_js_async_setting = 'async'
                    for script in scripts:
                        if optimiser_js_async_setting == 'async':
                            script.attrib['defer'] = 'defer'
                        else:
                            script.attrib['data-optimiser-src'] = \
                                script.attrib['src']
                            script.attrib.pop("src", None)
                    for script in lazy_scripts:
                        if optimiser_js_async_setting == 'async':
                            script.attrib['defer'] = 'defer'
                            script.attrib['src'] = script.attrib['data-src']
                        else:
                            script.attrib['data-optimiser-src'] = \
                                script.attrib['data-src']
                        script.attrib.pop("data-src", None)
                    if optimiser_js_async_setting == 'sync_lazy':
                        # Load the parked scripts on first scroll, or shortly
                        # after the load event, whichever comes first.
                        load_lazy_scripts = Element("script")
                        load_lazy_scripts.text = """function loadScripts() {
    var scripts = Array.from(document.querySelectorAll("script[data-optimiser-src]"));
    sessionStorage.setItem('secondTimeLoad', '1');
    function loadScript(scripts) {
        if(scripts.length){
            var attr = scripts[0].getAttribute("data-optimiser-src");
            scripts[0].setAttribute("src", attr);
            scripts[0].removeAttribute("data-optimiser-src");
            scripts[0].onload = function () {
                scripts.shift();
                loadScript(scripts);
            }
        }
    }
    loadScript(scripts)
}
window.addEventListener("scroll", function scrollEventFunction() {
    setTimeout(function(){loadScripts()},500)
},{once: true})
window.addEventListener("load", function () {
    var timer = sessionStorage.getItem('secondTimeLoad') ? 0 : 1500;
    setTimeout(function () { loadScripts() }, timer);
})"""
                        body.insert(len(body), load_lazy_scripts)
                if optimiser.page_loading:
                    # Full-screen loader, removed once the configured event
                    # fires.
                    page_loader_script_tag = Element("script")
                    page_loader_script_tag.text = (
                        "window.addEventListener('" +
                        optimiser.show_page_loading_until +
                        "', function(){document.querySelector('"
                        "div.optimiser-page-loader').remove();});")
                    page_loader_image_width = (
                        optimiser.page_loading_image_width or "100px")
                    page_loader_image_height = (
                        optimiser.page_loading_image_height or "100px")
                    page_loader_image_position_top = (
                        optimiser.page_loading_image_pos_top or "50%")
                    page_loader_image_position_left = (
                        optimiser.page_loading_image_pos_left or "50%")
                    page_loader_bg = (
                        optimiser.page_loading_bg_color or "#FFFFFF")
                    page_loader_bg_transparency = (
                        optimiser.page_loading_bg_transparency or 1)
                    page_loader_bg_image = (
                        (optimiser.show_default_page_loading_image or
                         optimiser.page_loading_image) and
                        "background-image: url(/optimiser-page-loader-image);"
                        or "")
                    page_loader_div = Element(
                        "div", **{
                            'class': "optimiser-page-loader",
                            'style': "position: fixed;"
                                     "left: 0;"
                                     "top: 0;"
                                     "width: 100%%;"
                                     "height: 100%%;"
                                     "z-index: 9999999999;"
                                     "%s"
                                     "background-repeat: no-repeat;"
                                     "background-size: %s %s;"
                                     "background-color: rgba%s;"
                                     "background-position: %s %s;"
                                     "background-attachment: fixed;" % (
                                         str(page_loader_bg_image),
                                         str(page_loader_image_width),
                                         str(page_loader_image_height),
                                         str(hex2rgb(
                                             page_loader_bg,
                                             page_loader_bg_transparency)),
                                         str(page_loader_image_position_top),
                                         str(page_loader_image_position_left))
                        })
                    body.insert(len(body), page_loader_script_tag)
                    body.insert(0, page_loader_div)
                if len(optimiser.custom_content_ids) > 0:
                    contents = optimiser.custom_content_ids
                    for content in contents:
                        if content.content:
                            try:
                                tmp = fromstring(content.content)
                            except Exception:
                                continue
                            if content.position.startswith('head'):
                                html = head
                            else:
                                html = body
                            position = (len(html)
                                        if content.position.endswith("end")
                                        else 0)
                            head_content = tmp.find('.//head')
                            if head_content is not None:
                                for tmp_content in head_content:
                                    html.insert(position, tmp_content)
                                    position += 1
                            else:
                                html.insert(position, tmp)
                if optimiser.preload_fonts and len(
                        optimiser.preload_fonts_ids) > 0:
                    for font in optimiser.preload_fonts_ids:
                        preload_font_elem = Element('link')
                        preload_font_elem.attrib['rel'] = "preload"
                        preload_font_elem.attrib['href'] = font.path
                        preload_font_elem.attrib['as'] = "font"
                        preload_font_elem.attrib['crossorigin'] = ""
                        head.insert(1, preload_font_elem)
            if optimiser.enable_lazy_load_front:
                images = res.cssselect('img:not(.og_not_lazy)')
                bg_images = res.cssselect(
                    '[style*="background-image"]'
                    ':not(.optimiser-page-loader):not(.og_not_lazy)')
                loading_image = (
                    (optimiser.show_default_image_loading_image or
                     optimiser.loading_image) and "/optimiser-loading"
                ) or "/optimiser/static/src/img/empty.png"
                check_class_regex = re.compile(r"^.*\s*optimiser_lazy(\s+|$)")
                if not request.httprequest.is_xhr:
                    lazy_loader_style = Element('style')
                    lazy_loader_style.text = (
                        'img[src="/optimiser-loading"]{width:40px!important;'
                        'height:40px!important;text-align:center;margin:auto;'
                        '-o-object-fit:contain!important;'
                        'object-fit:contain!important}')
                    head.insert(len(head), lazy_loader_style)
                for bg_img in bg_images:
                    # Cut the original URL out of the inline style and park it
                    # in data-src so the front-end script can restore it.
                    bg_style = bg_img.attrib['style']
                    find_bg_image = "background-image:"
                    try:
                        bg_image_index = bg_style.index(find_bg_image)
                    except ValueError:
                        continue
                    index_of_bg_image_start = bg_image_index + len(
                        find_bg_image)
                    try:
                        bg_style.index('url', index_of_bg_image_start)
                    except ValueError:
                        continue
                    try:
                        index_of_bg_image_end = bg_style.index(
                            ';', index_of_bg_image_start)
                    except ValueError:
                        try:
                            bg_style += ';'
                            index_of_bg_image_end = bg_style.index(
                                ';', index_of_bg_image_start)
                        except ValueError:
                            continue
                    important_exists = ''
                    try:
                        important_exists = bg_style[
                            index_of_bg_image_start:
                            index_of_bg_image_end].index('!important')
                    except ValueError:
                        pass
                    start_of_string = bg_style[:bg_style.index(find_bg_image)]
                    end_of_string = bg_style[index_of_bg_image_end + 1:]
                    if important_exists != '':
                        url_with_important_exist = bg_style[
                            index_of_bg_image_start:
                            index_of_bg_image_end].strip().rstrip(
                                '!important').strip()
                        important_exists = '!important'
                        main_image_url = url_with_important_exist.strip(
                        ).lstrip('url(').rstrip(')').strip("'").strip('"')
                    else:
                        main_image_url = bg_style[
                            bg_style.index('url', index_of_bg_image_start) +
                            3:index_of_bg_image_end].strip().lstrip(
                                '(').rstrip(')').strip("'").strip('"')
                    if 'data-src' not in bg_img.attrib:
                        bg_img.attrib['data-src'] = main_image_url
                    if not check_class_regex.match(
                            bg_img.attrib.get('class', '')):
                        bg_img.attrib['class'] = bg_img.attrib.get(
                            'class', '') + ' optimiser_lazy'
                    bg_img.attrib['style'] = (
                        "background-image: url('" + loading_image + "')" +
                        important_exists + ";" + start_of_string +
                        end_of_string)
                for img in images:
                    if 'data-src' not in img.attrib:
                        img.attrib['data-src'] = img.attrib['src']
                    img.attrib['src'] = loading_image
                    if not check_class_regex.match(
                            img.attrib.get('class', '')):
                        img.attrib['class'] = img.attrib.get(
                            'class', '') + ' optimiser_lazy'
            if optimiser.enable_recaptcha:
                if optimiser.captcha_selectors:
                    selectors = res.cssselect(','.join(
                        optimiser.captcha_selectors.mapped('name')))
                    if selectors:
                        captcha_element_parent = Element('div')
                        captcha_element_parent.attrib['class'] = \
                            'form-group field-recaptcha'
                        captcha_element = Element('div')
                        captcha_element.attrib['class'] = 'g-recaptcha'
                        captcha_element.attrib['data-sitekey'] = \
                            optimiser.captcha_site_key
                        captcha_element_parent.insert(0, captcha_element)
                        for element in selectors:
                            # Insert the captcha just before the last <div>
                            # child of each selected form.
                            insert_element = None
                            for i in reversed(element.getchildren()):
                                if i.tag == 'div':
                                    insert_element = i
                                    break
                            element.insert(element.index(insert_element),
                                           captcha_element_parent)
                        if not request.httprequest.is_xhr:
                            script_tag_for_recaptcha = Element("script")
                            script_tag_for_recaptcha.attrib['src'] = \
                                'https://www.google.com/recaptcha/api.js'
                            script_tag_for_recaptcha.attrib['async'] = 'async'
                            script_tag_for_recaptcha.attrib['defer'] = 'defer'
                            body.insert(len(body), script_tag_for_recaptcha)
            doctype = (None if '/slides/embed' in request.httprequest.url
                       else '<!DOCTYPE html>')
            res = htmlstring(res, method="html", doctype=doctype)
        else:
            res = res_copy
        if optimiser.compress_html:
            res = htmlmin.minify(res.decode("utf-8"),
                                 remove_empty_space=True,
                                 remove_comments=True)
    try:
        res = res.decode("utf-8")
    except Exception:
        pass
    return res
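
# hex2rgb is referenced by the page-loader style above but not defined in
# this file. A plausible sketch, assuming it maps "#RRGGBB" plus an alpha
# value to the "(r, g, b, a)" string interpolated into
# "background-color: rgba%s;":
def hex2rgb(hex_color, alpha=1):
    hex_color = hex_color.lstrip('#')
    r, g, b = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
    return "({}, {}, {}, {})".format(r, g, b, alpha)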
def getSongTag(songId, albumId):
    logger.debug("songId : %s", songId)
    logger.debug("albumId : %s", albumId)
    allTag = {}
    url = 'https://m.app.melon.com/song/detail.htm?songId='
    url = '%s%s' % (url, urllib.parse.quote(songId))
    data = LogicNormal.get_html(url)
    tree = html.fromstring(data)
    # Title
    try:
        h1 = tree.xpath('/html/body/div[1]/article/div[2]/div/h1')[0]
        title = h1.text.strip()
        allTag['title'] = title
    except Exception:
        allTag['title'] = ""
    # logger.debug("title : " + title)
    # Artist
    try:
        artist = ""
        p = tree.xpath('/html/body/div[1]/article/div[2]/div/p')[0]
        artist = p.text.strip()
        allTag['artist'] = artist
    except Exception:
        allTag['artist'] = ""
    # logger.debug("artist : " + artist)
    # Genre
    try:
        span = tree.xpath(
            '/html/body/div[1]/article/div[2]/ul/li[1]/span[2]')[0]
        genre = span.text.strip()
        allTag['genre'] = genre
    except Exception:
        allTag['genre'] = ""
    # logger.debug("genre : " + genre)
    # Album page
    url = 'https://m.app.melon.com/album/music.htm?albumId='
    url = '%s%s' % (url, urllib.parse.quote(albumId))
    data = LogicNormal.get_html(url)
    tree = html.fromstring(data)
    p = tree.xpath('/html/body/section/div[2]/div[1]/div/div[2]/p[2]')
    # Release year
    try:
        year = p[0].text[:4]
        allTag['year'] = year
    except Exception:
        allTag['year'] = ""
    # logger.debug("year : " + year)
    # Track number: find the track-list entry whose title matches the song
    # title and read its position.
    try:
        track = "00"
        lis = tree.xpath('/html/body/div[1]/article/div[2]/ul/li')
        logger.debug("lis : %d", len(lis))
        if len(lis) == 1:
            p = tree.xpath(
                '/html/body/div[1]/article/div[2]/ul/li/div[2]/div/a/p')[0]
            pHtml = p.text_content().strip()
            pHtml = pHtml.replace('타이틀', "")  # strip the "title track" badge
            p = pHtml.strip()
            if p == title:
                div = tree.xpath(
                    '/html/body/div[1]/article/div[2]/ul/li/div[1]')[0]
                track = div.text_content().strip()
        else:
            for i in range(0, len(lis)):
                cnt = i + 1
                logger.debug("i : %d", i)
                p = tree.xpath(
                    '/html/body/div[1]/article/div[2]/ul/li[%s]/div[2]/div/a/p'
                    % cnt)[0]
                span = tree.xpath(
                    '/html/body/div[1]/article/div[2]/ul/li[%s]/div[2]/div/a/p/span'
                    % cnt)
                if len(span) == 1:
                    pHtml = p.text_content().strip()
                    pHtml = pHtml.replace('타이틀', "")
                    p = pHtml.strip()
                else:
                    p = p.text.strip()
                if p == title:
                    div = tree.xpath(
                        '/html/body/div[1]/article/div[2]/ul/li[%s]/div[1]'
                        % cnt)[0]
                    track = div.text_content().strip()
        allTag['track'] = track
    except Exception:
        allTag['track'] = ""
    # logger.debug("track : " + track)
    # Album cover image
    try:
        albumImage = ""
        meta = tree.xpath('/html/head/meta[6]')[0]
        albumImage = meta.attrib.get("content")
        allTag['albumImage'] = albumImage
    except Exception:
        allTag['albumImage'] = ""
    # logger.debug("albumImage : " + albumImage)
    # Album title
    try:
        album = ""
        p = tree.xpath('/html/body/section/div[2]/div[1]/div/div[2]/p[1]')[0]
        album = p.text.strip()
        allTag['album'] = album
    except Exception:
        allTag['album'] = ""
    # logger.debug("album : " + album)
    # Lyrics: serialize the lyrics <div> and unwrap its markup.
    try:
        url = 'https://m.app.melon.com/song/lyrics.htm?songId='
        url = '%s%s' % (url, urllib.parse.quote(songId))
        data = LogicNormal.get_html(url)
        tree = html.fromstring(data)
        div = tree.xpath('/html/body/div[1]/article/div[2]/div[2]')[0]
        lyrics = htmlstring(div, encoding='unicode')
        lyrics = lyrics.replace('<div class="lyrics">', "")
        lyrics = lyrics.replace(" ", "")
        lyrics = lyrics.replace("</div>", "")
        lyrics = lyrics.replace("<br/>", "\n").strip()
        allTag['lyrics'] = lyrics
    except Exception:
        allTag['lyrics'] = ""
    # logger.debug("lyrics : " + lyrics)
    return allTag
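
# A minimal usage sketch for getSongTag; the ids below are placeholders
# rather than real Melon song/album ids, and LogicNormal.get_html plus
# logger are assumed to be provided by the surrounding plugin:
def example_get_song_tag():
    tags = getSongTag("12345678", "1234567")  # hypothetical songId/albumId
    logger.debug("parsed tags : %s", sorted(tags.keys()))
    return tags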
def movie_details(url):
    ''' Extracts movie info from URL '''
    try:
        content = requests.get(url, headers={
            'Accept-Encoding': 'identity'
        }).text
        dom = LH.fromstring(content)
        status = "RELEASED"
        rank = -1
        try:
            rank = float(re.findall(r'\d+', url)[0])
        except Exception:
            log_influxdb("COULDNT_RANK")
            rank = -1.0
        try:
            length = re.findall(r'\(([0-9]+)h ([0-9]+)min\)', content)
            hours = float(length[0][0])
            minutes = float(length[0][1])
        except Exception:
            status = "UNRELEASED"
            hours = minutes = float(-1)
            log_influxdb("UNRELEASED_MOVIES")
        try:
            score_press = float(css_select(
                ".rating-holder .rating-item:nth-child(1) .stareval-note",
                dom)[0].text.replace(",", "."))
            reviews_press = float(css_select(
                ".rating-holder .rating-item:nth-child(1) .stareval-review",
                dom)[0].text)
        except Exception:
            score_press = reviews_press = float(-1)
        try:
            score_viewers = float(css_select(
                ".rating-holder .rating-item:nth-child(2) .stareval-note",
                dom)[0].text.replace(",", "."))
            reviews_viewers = float(css_select(
                '.rating-holder .rating-item:nth-child(2) .stareval-review '
                '[itemprop="ratingCount"]', dom)[0].text)
        except Exception:
            score_viewers = reviews_viewers = float(-1)
        try:
            date = to_ascii(css_select(".date.blue-link", dom)[0].text)
        except Exception:
            status = "NO RELEASE DATE"
            date = ""
            log_influxdb("NO RELEASE DATE")
        try:
            synopsis = to_ascii(css_select(".synopsis-txt", dom)[0].text)
        except Exception:
            status = "NO SYNOPSIS"
            synopsis = ""
            log_influxdb("NO SYNOPSIS")
        title = to_ascii(
            css_select(".titlebar-title.titlebar-title-lg", dom)[0].text)
        cover = to_ascii(
            css_select(".card-movie-overview .thumbnail-img",
                       dom)[0].get("src"))
        director = to_ascii(
            css_select('[itemprop="director"] [itemprop="name"]',
                       dom)[0].text)
        genre = [to_ascii(result.text)
                 for result in css_select('[itemprop="genre"]', dom)]
        nationalities = [
            to_ascii(result.text)
            for result in css_select(".blue-link.nationality", dom)
        ]
        pictures = [to_ascii(result.get("data-src"))
                    for result in css_select(".shot-img", dom)]
        actors = [to_ascii(result.text) for result in css_select(
            ".card-movie-overview .meta-body .meta-body-item:nth-child(3) "
            "span.blue-link:not(.more)", dom)]
        misc = to_ascii(htmlstring(css_select(".ovw-synopsis-info", dom)[0]))
        try:
            # Follow the trailer link, preferring the HD rendition.
            trailer = ALLOCINE_URL_PREFIX + css_select(
                ".trailer", dom)[0].get("href").replace("&amp;", "&")
            trailer = requests.get(trailer, headers={
                'Accept-Encoding': 'identity'
            }).text
            trailer_hd = re.findall(
                r"([\.\\\/0-9a-zA-Z_]+hd[\\/0-9a-zA-Z_]+\.mp4)", trailer)
            if len(trailer_hd):
                trailer = to_ascii(u'http:' + stripslashes(trailer_hd[0]))
            else:
                trailer = to_ascii(u'http:' + stripslashes(re.findall(
                    r"([\.\\\/0-9a-zA-Z_]+[^k]\.mp4)", trailer)[0]))
        except Exception:
            trailer = ""
            status = "MISSING TRAILER"
            log_influxdb("FAILED_TRAILERS")
        log_influxdb("MOVIES")
        return [
            status, hours, minutes, title, date, cover, director, genre,
            nationalities, score_press, reviews_press, score_viewers,
            reviews_viewers, pictures, actors, synopsis, misc, trailer, rank
        ]
    except Exception:
        log_influxdb("FAILED_MOVIES")
        return []
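
# stripslashes, used on the trailer URLs above, is not defined in this file.
# A minimal sketch assuming it unescapes backslash-escaped paths of the kind
# found in embedded player JSON (e.g. "\/video\/clip_hd.mp4" becomes
# "/video/clip_hd.mp4"):
def stripslashes(s):
    # Drop each escaping backslash, keeping the character it escaped.
    return re.sub(r'\\(.)', r'\1', s)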