def test_product_card_location(self):
    """The Products card must be rendered above the Extracted Text card."""
    page = self.client.get("/datadocument/179486/").content.decode("utf-8")
    extracted_pos = page.index('id="extracted-text-title"')
    product_pos = page.index('id="product-title"')
    self.assertTrue(
        product_pos < extracted_pos,
        "Product card should come before Extracted Text card",
    )
def test_product_card_location(self):
    """The Products card must be rendered below the Extracted Text card."""
    page = self.client.get('/datadocument/179486/').content.decode('utf-8')
    extracted_pos = page.index('<h4>Extracted Text</h4>')
    product_pos = page.index('<h4 class="d-inline">Products</h4>')
    self.assertTrue(
        product_pos > extracted_pos,
        'Product card should come after Extracted Text card',
    )
def block_html(self, html):
    """Render a block-level HTML chunk.

    Intercepts ``<latex ...>...</latex>`` blocks: the contained LaTeX source
    is sent to the texoid rendering service and replaced with an ``<img>``
    pointing at the rendered SVG (with a PNG fallback wired up via onerror).
    Anything else is delegated to the parent renderer unchanged.
    """
    if self.texoid and html.startswith('<latex'):
        # attribute text between '<latex' and the first '>', e.g. ' inline'
        attr = html[6:html.index('>')]
        # LaTeX payload between the first '>' and the final '<' (closing tag)
        latex = html[html.index('>') + 1:html.rindex('<')]
        latex = self.parser.unescape(latex)
        result = self.texoid.get_result(latex)
        if not result:
            # renderer unavailable / returned nothing: show escaped source
            return '<pre>%s</pre>' % mistune.escape(latex, smart_amp=False)
        elif 'error' not in result:
            # NOTE(review): the concatenated literal below produces
            # ...onerror="..."width="..." with no space between the onerror
            # attribute and width= — looks unintended, confirm upstream.
            img = ('''<img src="%(svg)s" onerror="this.src='%(png)s';this.onerror=null"'''
                   'width="%(width)s" height="%(height)s"%(tail)s>') % {
                'svg': result['svg'],
                'png': result['png'],
                'width': result['meta']['width'],
                'height': result['meta']['height'],
                # XHTML output wants a self-closing ' /' before '>'
                'tail': ' /' if self.options.get('use_xhtml') else ''
            }
            # NOTE(review): 'width' below is set from meta *height*, not
            # meta width — possibly deliberate (square sizing?), confirm.
            style = ['max-width: 100%',
                     'height: %s' % result['meta']['height'],
                     'max-height: %s' % result['meta']['height'],
                     'width: %s' % result['meta']['height']]
            if 'inline' in attr:
                tag = 'span'
            else:
                tag = 'div'
                style += ['text-align: center']
            return '<%s style="%s">%s</%s>' % (tag, ';'.join(style), img, tag)
        else:
            # texoid reported an error: show the error message escaped
            return '<pre>%s</pre>' % mistune.escape(result['error'], smart_amp=False)
    return super(AwesomeRenderer, self).block_html(html)
def find_link(html, start_index):
    """Find the next product link at/after ``start_index``.

    Locates the search pivot, then extracts the href value that follows it.
    Returns a LinkResult carrying the link text and the index just past it.
    Raises ValueError (via str.index) if any marker is missing.
    """
    anchor = html.index(Config.AMAZON_SEARCH_PIVOT, start_index)
    marker = "href=\""
    begin = html.index(marker, anchor) + len(marker)
    finish = html.index("\"", begin)
    return LinkResult(html[begin:finish], finish)
def getCalories(html):
    """Extract the integer calorie count that follows "Calories " in ``html``.

    Looks for the text between the marker "Calories " and the next "</b>"
    tag and converts it to int.  Returns -1 when either marker is missing
    or the extracted text is not a valid integer.
    """
    string1 = "Calories "
    string2 = "</b>"
    try:
        start = html.index(string1) + len(string1)
        end = html.index(string2, start)
        return int(html[start:end])
    # was a bare `except:` — narrowed to ValueError, which covers both a
    # missing marker (str.index) and a non-numeric payload (int()).
    except ValueError:
        return -1
def getCalories(html):
    """Extract the integer calorie count that follows "Calories " in ``html``.

    Looks for the text between the marker "Calories " and the next "</b>"
    tag and converts it to int.  Returns -1 when either marker is missing
    or the extracted text is not a valid integer.
    """
    string1 = "Calories "
    string2 = "</b>"
    try:
        start = html.index(string1) + len(string1)
        end = html.index(string2, start)
        return int(html[start:end])
    # was a bare `except:` — narrowed to ValueError, which covers both a
    # missing marker (str.index) and a non-numeric payload (int()).
    except ValueError:
        return -1
def get_data(self, url):
    """Fetch a JSONP-wrapped endpoint and return the decoded payload.

    Strips the callback wrapper (everything outside the first '(' and the
    first ')') and decodes the remainder with demjson.  Returns '' when
    decoding fails.
    """
    resp = requests.get(url=url, headers=self.headers)
    resp.encoding = 'utf-8'
    body = resp.text
    # keep only the text between the first '(' and the first ')'
    payload = body[body.index("(") + 1:]
    payload = payload[:payload.index(")")]
    try:
        return demjson.decode(payload)
    except Exception:
        return ""
def is_amazon_seller(html):
    """Return True if the first other-sellers offer block belongs to Amazon.

    Finds the seller-info block, then checks whether the Amazon proprietary
    tag appears *within* that block (i.e. before the block's end marker).
    Returns False when the tag is absent entirely.
    Raises ValueError (via str.index) if the seller-info markers are missing.
    """
    seller_index = html.index(Config.AMAZON_OTHERSELLERS_SELLERINFO_PIVOT)
    seller_end = html.index(Config.AMAZON_OTHERSELLERS_SELLERINFO_END_PIVOT,
                            seller_index)
    try:
        amazon_index = html.index(Config.AMAZON_OTHERSELLERS_PROPRIETARY_TAG,
                                  seller_index)
    # was a bare `except:` — only str.index's "not found" should mean False
    except ValueError:
        return False
    return amazon_index < seller_end
def test_ingredient_rank(self):
    """Chemicals must render in ascending ingredient_rank order."""
    doc = DataDocument.objects.get(pk=254643)
    chems = doc.extractedtext.rawchem.select_subclasses()
    lowest = chems.first()
    highest = chems.last()
    self.assertTrue(highest.ingredient_rank > lowest.ingredient_rank)
    page = self.client.get(f"/datadocument/{doc.pk}/").content.decode("utf-8")
    pos_low = page.index(f'id="chem-{lowest.pk}"')
    pos_high = page.index(f'id="chem-{highest.pk}"')
    self.assertTrue(
        pos_high > pos_low,
        "Ingredient rank 1 comes before Ingredient rank 2",
    )
def scrapeGame(url):
    """Scrape a Steam community profile's games XML and return a list of
    {"game": name, "hours": hoursOnRecord} dicts, parsed by raw string
    slicing rather than via the lxml tree.
    """
    html = scraperwiki.scrape(url+"/games?xml=1")
    # NOTE(review): `root` is parsed but never used below — parsing is done
    # with string indexes instead; confirm the lxml parse is still wanted.
    root = lxml.html.fromstring(html)
    # slice out the content between the opening and closing gamesList tags
    # (+10 skips past "gamesList>")
    sindex = html.index("gamesList")+10
    eindex = html.rindex("gamesList")
    bodycontent = html[sindex:eindex]
    # then the content between the <games>...</games> wrapper
    sindex = bodycontent.index("games")+6
    eindex = bodycontent.rindex("games")-2
    rows = bodycontent[sindex:eindex]
    # these also act as a sanity check: str.index raises ValueError when the
    # page has no <game> entries (the values are overwritten in the loop)
    start = rows.index("<game>")
    end = rows.index("</game>")
    glist = rows.split("<game>")
    gamestats = []
    for g in glist:
        start = g.find("<name>")
        end = g.find("</name>")
        start2 = g.find("<hoursOnRecord>")
        end2 = g.find("</hoursOnRecord>")
        # skip fragments without a name (e.g. the text before the first <game>)
        if start!=-1 and end!=-1:
            name = g[start+6:end]
            # presumably strips a "<![CDATA[...]]" wrapper around the name —
            # TODO confirm against the actual Steam XML payload
            game = name[9:len(name)-3]
            hours = 0.0
            if start2==-1 or end2==-1:
                # game never played: no hoursOnRecord element
                hours = 0.0
            else:
                hours = float(g[start2+15:end2])
            stat = {"game":game,"hours":hours}
            gamestats.append(stat)
    return gamestats
def get_bestseller_rank(resp):
    """Extract the best-seller rank string that precedes the rank pivot.

    Scans backwards from the pivot for the '#' that starts the rank number
    and returns everything up to the following space.
    Raises EnvironmentError when the page is a CAPTCHA challenge.
    """
    html = resp.content
    if "Robot Check" in html:
        raise EnvironmentError("CAPTCHA :(")
    pivot = html.index(Config.AMAZON_SELLERRANK_PIVOT)
    start = html.rfind("#", 0, pivot) + 1
    end = html.find(" ", start)
    return html[start:end]
def get_othersellers_lowest_prices(resp):
    """Parse the first other-sellers offer and return its prices.

    Returns [main, shipping, tax] as parsed by get_price; each entry stays 0
    when its pivot is missing or falls outside the first offer block.
    Returns [-1, -1, -1] when the lowest offer is Amazon's own.
    Raises EnvironmentError on a CAPTCHA page.
    NOTE: Python 2 code — `e.message` does not exist on Python 3 exceptions.
    """
    html = resp.content
    if "Robot Check" in html:
        raise EnvironmentError("CAPTCHA :(")
    if is_amazon_seller(html):
        return [-1, -1, -1]
    prices = [0, 0, 0]  #main, shipping, tax
    # bounds of the first offer entry on the other-sellers page
    offer_start = html.index("a-spacing-mini olpOffer")
    offer_end = html.index("</p>", offer_start)
    try:
        mainprice_index = html.index(
            Config.AMAZON_OTHERSELLERS_OFFERPRICE_PIVOT, offer_start)
        # NOTE(review): the main price aborts with an early return when the
        # pivot lies past the offer block, while shipping/tax below merely
        # skip — confirm the asymmetry is intended.
        if mainprice_index > offer_end:
            print("mainprice_index wastoo big.")
            return prices
        prices[0] = get_price(html, mainprice_index)
    except Exception as e:
        print("Error in mainprice_index: " + e.message)
    try:
        shippingprice_index = html.index(
            Config.AMAZON_OTHERSELLERS_SHIPPINGPRICE_PIVOT, offer_start)
        # only accept a shipping price found inside this offer block
        if shippingprice_index < offer_end:
            prices[1] = get_price(html, shippingprice_index)
    except Exception as e:
        print("Error in shippingprice_index: " + e.message)
    try:
        taxprice_index = html.index(Config.AMAZON_OTHERSELLERS_TAXPRICE_PIVOT,
                                    offer_start)
        # only accept a tax price found inside this offer block
        if taxprice_index < offer_end:
            prices[2] = get_price(html, taxprice_index)
    except Exception as e:
        print("Error in taxprice_index: " + e.message)
    return prices
def get_main_table_from_file(f, year):
    """Return the ``mainTable`` table from file ``f`` for year ``year``,
    undoing the per-year obfuscation: plain HTML from 2012 on, raw HTML with
    leading garbage through 2009, and Siveco's HTML-inside-HTML in between.
    """
    if year > 2011:
        # modern files are plain HTML — no cleanup required
        return get_main_table(f.read(), year)
    if year <= 2009:
        html = f.read()
    else:
        # 2010-2011: scan lines for the embedded inner HTML document
        html = None
        for line in f:
            inner = get_inner_html(line)
            if inner:
                html = inner
                break
        if html is None:
            return None
    # strip everything before the real document start
    html = html[html.index('<HTML>'):]
    return get_main_table(html, year)
def get_blue_peter_content (url):
    # Fetch the Blue Peter lunch-menu PDF linked from the page at `url` and
    # return today's menu section as a one-element list ([] on failure).
    # NOTE: Python 2 code (print statement); urlopen/pyPdf/io/re/datetime
    # come from the enclosing module's imports.
    try:
        today = datetime.date.today()
        html = urlopen(url).read()
        # locate the lunch-list button image, then walk backwards from it to
        # recover the "http://...pdf" URL that precedes it in the markup
        btn = html.index ("btnlounaslista.gif")
        end = html[:btn].rindex ("pdf") + 3
        start = html[:end].rindex ("http://")
        pdf_url = html[start:end]
        pdf_raw_data = urlopen(pdf_url).read()
        pdf = pyPdf.PdfFileReader(io.BytesIO(pdf_raw_data))
        pdf_text = pdf.pages[0].extractText()
        pdf_text = re.sub("Liikelounasmenu ", '', pdf_text)
        # weekday headings split the text into per-day sections; weekday()+1
        # skips the preamble before MAANANTAI (Monday)
        delimiters = "MAANANTAI|TIISTAI|KESKIVIIKKO|TORSTAI|PERJANTAI|VL ="
        return [re.split(delimiters, pdf_text)[datetime.date.weekday(today) + 1]]
    except ValueError:
        # any failing .index/.rindex lookup lands here
        print "Failed to find Blue peter pdf"; return []
def connect(self):
    """
    connect to onleihe website emulating webbrowser
    - cookies added to requests session

    Raises ValueError when the start page looks wrong or the simple-search
    section is missing; a dump of the offending HTML is written for debugging.
    """
    response = self.session.get(STARTURL)
    LOGGER.debug("got cookies: %s" % self.session.cookies)  # a RequestsCookieJar
    LOGGER.debug("JSESSIONID='%s'" % (response.cookies.get('JSESSIONID')))
    assert self.session.cookies.get('JSESSIONID') == response.cookies.get(
        'JSESSIONID')
    html = response.content.decode('utf-8')
    #if not '<title>die OnleiheRegio. Startseite</title>' in html:
    if 'title="die OnleiheRegio"' not in html or 'An unexpected error has occurred!' in html:
        # use a context manager so the dump file is closed/flushed reliably
        with open('onliehe_start_bad.html', 'w') as dump:
            dump.write(html)
        raise ValueError("unexpected response from onleihe url=%s." % STARTURL)
    # <section id="simple-search">
    # <h3>Einfache Suche</h3>
    # BUG FIX: str.index() raises ValueError when absent and never returns a
    # negative value, so the original `html.index(...) < 0` branch was
    # unreachable; str.find() returns -1 and lets the intended error fire.
    if html.find('<section id="simple-search">') < 0:
        with open('onliehe_missing_search.html', 'w') as dump:
            dump.write(html)
        raise ValueError("missing search on onleihe page url=%s." % STARTURL)
    return
def get_blue_peter_content(url):
    # Fetch the Blue Peter lunch-menu PDF linked from the page at `url` and
    # return today's menu section as a one-element list ([] on failure).
    # NOTE: Python 2 code (print statement); urlopen/pyPdf/io/re/datetime
    # come from the enclosing module's imports.
    try:
        today = datetime.date.today()
        html = urlopen(url).read()
        # locate the lunch-list button image, then walk backwards from it to
        # recover the "http://...pdf" URL that precedes it in the markup
        btn = html.index("btnlounaslista.gif")
        end = html[:btn].rindex("pdf") + 3
        start = html[:end].rindex("http://")
        pdf_url = html[start:end]
        pdf_raw_data = urlopen(pdf_url).read()
        pdf = pyPdf.PdfFileReader(io.BytesIO(pdf_raw_data))
        pdf_text = pdf.pages[0].extractText()
        pdf_text = re.sub("Liikelounasmenu ", '', pdf_text)
        # weekday headings split the text into per-day sections; weekday()+1
        # skips the preamble before MAANANTAI (Monday)
        delimiters = "MAANANTAI|TIISTAI|KESKIVIIKKO|TORSTAI|PERJANTAI|VL ="
        return [
            re.split(delimiters, pdf_text)[datetime.date.weekday(today) + 1]
        ]
    except ValueError:
        # any failing .index/.rindex lookup lands here
        print "Failed to find Blue peter pdf"
        return []
# Scraper chunk (Python 2): read the AAII-style bullish/neutral/bearish
# percentages from the already-parsed `root` document and the raw `html`
# string (both defined earlier in the script, outside this chunk), then
# store one row per mood in the scraperwiki sqlite store.
# NOTE(review): the imports at the bottom of this chunk appear *after* their
# use — scraperwiki concatenates script revisions, so earlier copies
# presumably provided them; confirm before reordering.
i = 0;
# the first three percentage-bearing spans are assumed to be, in order,
# bullish, neutral, bearish — TODO confirm against the page layout
for el in root.cssselect("span.surveyNumber"):
    if el.text[len(el.text) - 1] == "%":
        if i == 0:
            bullish = el.text.strip().replace("%", "")
        if i == 1:
            neutral = el.text.strip().replace("%", "")
        if i == 2:
            bearish = el.text.strip().replace("%", "")
        i = i + 1
print bullish
print neutral
print bearish
# the survey date follows the literal text "Week ending" (11 chars),
# formatted as mm/dd/yyyy (10 chars)
index = html.index("Week ending")
print index
date = html[index + 11:index + 21].strip()
print date
date_object = datetime.strptime(date, '%m/%d/%Y')
# one row per mood, keyed on (survey_date, mood) so re-runs upsert
scraperwiki.sqlite.save(unique_keys=["survey_date", "mood"], data={"survey_date":date_object, "mood":"bullish", "survey_percentage":bullish})
scraperwiki.sqlite.save(unique_keys=["survey_date", "mood"], data={"survey_date":date_object, "mood":"neutral", "survey_percentage":neutral})
scraperwiki.sqlite.save(unique_keys=["survey_date", "mood"], data={"survey_date":date_object, "mood":"bearish", "survey_percentage":bearish})
import scraperwiki
import lxml.html
import datetime
# rebinds the name `datetime` from the module to the class, matching the
# `datetime.strptime(...)` call above
from datetime import datetime
def get_price(html, pivot_index):
    """Parse the price that follows ``pivot_index``.

    Returns 0 when the enclosing span closes before any '$' appears;
    otherwise delegates to parse_price at the '$' position.
    """
    dollar_at = html.index("$", pivot_index)
    span_close = html.index("</span>", pivot_index)
    return 0 if dollar_at > span_close else parse_price(html, dollar_at)
def get_mainTable(html):
    """Return the lxml element for the table with id "mainTable".

    Drops any garbage before the '<HTML>' marker, decodes the raw bytes as
    UTF-8 (Python 2 ``unicode``), and locates the table by xpath.
    Raises IndexError when no such table exists.
    """
    trimmed = html[html.index('<HTML>'):]
    doc = lxml.html.fromstring(unicode(trimmed, 'utf-8'))
    return doc.xpath(r'''//table[@id="mainTable"]''')[0]