def grab_entry(url):
    doc = scraper.get(url).html()
    art = doc.find('.//div[@class="article"]')
    meta = {'html': html.tostring(art), 'info_url': url}
    meta['person'] = art.findtext('./h3/span')
    for item in art.findall('.//li'):
        if 'download' in item.get('class', ''):
            doc_url = item.find('.//a').get('href')
            meta['source_url'] = urljoin(url, doc_url)
            continue
        label = item.findtext('./label')
        if label is not None:
            label = label.strip().lower().replace(' ', '_')
        content = item.find('./span')
        if content is None:
            continue
        content = html.tostring(content).split('>', 1)[-1].rsplit('<', 1)[0]
        if 'gifts' in item.get('class', ''):
            items = map(clean, content.split('<br>'))
            meta[label] = filter(lambda s: len(s), items)
        else:
            meta[label] = clean(content)
    if 'pdf' in meta.get('source_url', ''):
        print meta['source_url']
    collection.ingest(meta.get('source_url'), **meta)
def prepare_updated_content(self, slug, rendered_content):
    """Given a slug, get its current content. Either insert the HTML string
    'rendered_content' as the first child of the current content for the slug,
    or, if an existing element with the same tag and id exists, replace that
    element. Return the title and current content as a tuple of strings."""
    content_item = self.p2p_get_content_item(slug)
    current_content = content_item['body']
    title = content_item['title']
    parsed = fromstring(current_content)
    container = parsed.find('.//div[@id="layercake-items"]')
    if container is None:
        container = parsed.makeelement('div', {'id': 'layercake-items'})
        made_container = True
    else:
        made_container = False
    new_parsed = fromstring(rendered_content)
    try:
        existing = container.find("%s[@id='%s']" % (new_parsed.tag, new_parsed.attrib['id']))
    except KeyError:
        existing = None
    if existing is not None:
        existing.addnext(new_parsed)
        container.remove(existing)
    else:
        container.insert(0, new_parsed)
    # TODO: consider timestamping the CSS URL
    if made_container:
        new_content = tostring(container)
        return title, current_content + new_content
    return title, tostring(parsed)
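# Minimal standalone sketch of the insert-or-replace-by-id pattern used above,
# assuming only lxml.html; the p2p client and the 'layercake-items' id are
# specific to the original code and are reused here purely for illustration.
from lxml.html import fromstring, tostring

def upsert_fragment(container_html, fragment_html):
    container = fromstring(container_html)
    fragment = fromstring(fragment_html)
    existing = container.find("%s[@id='%s']" % (fragment.tag, fragment.get('id')))
    if existing is not None:
        existing.addnext(fragment)     # put the new element next to the old one...
        container.remove(existing)     # ...then drop the old one
    else:
        container.insert(0, fragment)  # no match: insert as first child
    return tostring(container, encoding='unicode')

# upsert_fragment('<div id="layercake-items"><p id="a">old</p></div>', '<p id="a">new</p>')
# -> '<div id="layercake-items"><p id="a">new</p></div>'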
def test_detokenize_single(self):
    src_tree = self._load()
    orig_src_tree = deepcopy(src_tree)
    tokenizer = HtmlTokenizer()
    html_tokens, tags = tokenizer.tokenize_single(src_tree)
    new_tree = tokenizer.cleanup_tree(src_tree)
    self.assertIn(b'__START_ORG__', tostring(src_tree))
    self.assertNotIn(b'__START_ORG__', tostring(new_tree))
    self.assertHtmlTreeEqual(
        new_tree,
        html_document_fromstring(UNANNOTATED_HTML)
    )
    html_tokens, _ = tokenizer.tokenize_single(new_tree)
    detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
    self.assertIn(b'__START_ORG__', tostring(detokenized_tree))
    self.assertHtmlTreeEqual(
        detokenized_tree,
        html_document_fromstring(ANNOTATED_HTML)
    )
    self.assertHtmlTreeEqual(detokenized_tree, orig_src_tree)
    self.assertHtmlTreeEqual(detokenized_tree, src_tree)
def get_items_from_page_num(self, num):
    url = self.WISHLIST_PAGE_TEMPLATE.format(
        wishlist_id=self.wishlist_id,
        page_number=num,
    )
    _LOG.debug("Fetch from: %s", url)
    wishlist_page = requests.get(url)
    wishlist_page_html = wishlist_page.text
    _PLAIN_ERROR_LOGGER.debug(wishlist_page_html)
    tree = html.fromstring(wishlist_page_html)
    all_h5_nodes = tree.xpath("//div[@class='a-row a-size-small']/h5")
    items = []
    for h5_node in all_h5_nodes:
        try:
            item = self._get_item_from_idea_h5_node(h5_node)
            if not item:
                item = self._get_item_from_amazon_item_h5_node(h5_node)
            if item:
                items.append(item)
            else:
                _LOG.warn("Fail to retrieve an item for snippet")
                _PLAIN_ERROR_LOGGER.warn("===== Start of snippet =====")
                _PLAIN_ERROR_LOGGER.warn(html.tostring(h5_node))
                _PLAIN_ERROR_LOGGER.warn("===== End of snippet =====")
        except ValueError as ex:
            _LOG.exception("Fail to retrieve an item: %s", ex)
            _PLAIN_ERROR_LOGGER.warn("===== Start of snippet =====")
            _PLAIN_ERROR_LOGGER.warn(html.tostring(h5_node))
            _PLAIN_ERROR_LOGGER.warn("===== End of snippet =====")
    return items
def _decorate_article(self, article):
    """Runs after parse_response to post-process its output."""
    # html post-process
    from lxml.html import tostring, fromstring
    from bs4 import BeautifulSoup
    from lib.util.net import normalize_url
    from lib.util.text import pack_string

    # article['content'] may be a list of lxml doms
    if type(article['content']) is list:
        article['content'] = \
            fromstring('\n'.join([tostring(x, encoding=unicode) for x in article['content']]))

    # remove unwanted tags
    self.css_sel_drop_tree(article['content'], ['script'])

    # prettify html with BeautifulSoup
    html_bs4 = BeautifulSoup(tostring(article['content'], encoding=unicode)).body.next

    article['text'] = pack_string(html_bs4.text)
    article['html'] = pack_string(unicode(html_bs4))
    article["ctlr_classname"] = str(self.__class__)
    article['url'] = normalize_url(article['url'])
    article['url_read'] = normalize_url(article['url_read'])
    article['url_canonical'] = normalize_url(article['url_canonical'])
    self.move_out_of_meta(article, 'title')
    return article
def get_services(category, trs):
    for tr in trs:
        print tostring(tr)
        tds = tr.cssselect("td")
        if len(tds) == 0:
            continue
        ahref = tds[0].cssselect("a")[0]
        link = ahref.attrib["href"]
        if category == 'Community Services':
            id = link.split('_')[0]
        elif category == 'Family Services':
            id = link.replace('result_detail.asp?externalId=', '')
        title = ahref.text_content()
        sub_category = tds[1].text_content()
        telephone_number = tds[2].text_content()
        print title, sub_category, telephone_number, link
        data = {
            'id': id,
            'link': base_url + link,
            'title': title,
            'category': category,
            'sub_category': sub_category,
            'telephone_number': telephone_number,
        }
        scraperwiki.sqlite.save(unique_keys=['id'], data=data)
def _read_version_history_html(self, forum_link):
    br = browser()
    br.set_handle_gzip(True)
    try:
        raw = br.open_novisit(forum_link).read()
        if not raw:
            return None
    except:
        traceback.print_exc()
        return None
    raw = raw.decode('utf-8', errors='replace')
    root = html.fromstring(raw)
    spoiler_nodes = root.xpath('//div[@class="smallfont" and strong="Spoiler"]')
    for spoiler_node in spoiler_nodes:
        try:
            if spoiler_node.getprevious() is None:
                # This is a spoiler node that has been indented using [INDENT]
                # Need to go up to parent div, then previous node to get header
                heading_node = spoiler_node.getparent().getprevious()
            else:
                # This is a spoiler node after a BR tag from the heading
                heading_node = spoiler_node.getprevious().getprevious()
            if heading_node is None:
                continue
            if heading_node.text_content().lower().find('version history') != -1:
                div_node = spoiler_node.xpath('div')[0]
                text = html.tostring(div_node, method='html', encoding='unicode')
                return re.sub(r'<div\s.*?>', '<div>', text)
        except:
            if DEBUG:
                prints('======= MobileRead Parse Error =======')
                traceback.print_exc()
                prints(html.tostring(spoiler_node))
    return None
def get_article(url, mode=None):
    returnee = {}
    now = time.localtime()
    if not mode:
        agent = "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/1A542a Safari/419.3"
        structure = requests.get(url, headers={"User-Agent": agent}, timeout=5.0)
    else:
        structure = mode
    charset = structure.encoding
    tree = html.fromstring(structure.text)
    body = tree.cssselect("div#ct")[0]
    title = body.cssselect("div.end_tt h2")[0]
    title.remove(title.cssselect("a")[0])
    returnee["title"] = st.refine_text(html.tostring(title), encoding=charset)
    returnee["name"] = st.refine_text(html.tostring(body.cssselect("div.end_tt p span a")[0]), encoding=charset)
    date = datetime.datetime.now()
    try:
        date = DATE.parse(st.refine_text(html.tostring(body.cssselect("div.end_tt p span.s_tm")[0]), encoding=charset))
    except Exception, e:
        pass
def get_branch_info(self):
    row = {}
    row['date_scraped'] = DATE
    # Skip the junk last row
    trs = self.x.xpath('id("Centralcolum3_dtgGroup")/descendant::tr[td/*[self::span or self::strong]]')[:-1]
    for tr in trs:
        tds = tr.xpath('td')
        if len(tds) == 1:
            td = tds[0]
            if 2 == td.xpath('count(span/b/text())'):
                row['loc1'], row['loc2'] = [PostbankBrowser.compact(text) for text in td.xpath('span/b/text()')]
            else:
                log(tostring(td))
        elif len(tds) == 2:
            cells = tr.xpath('td/*[self::span or self::strong]')
            key = cells[0].text
            value = cells[1].text
            for thing in [key, value]:
                if thing is None:
                    thing = ""
                else:
                    thing = PostbankBrowser.compact(thing)
            row[key] = value
        else:
            raise self.TableRowError(tostring(tr))
    return row
def test_innerhtml(self):
    from mobilize.components import XPath
    html_str = '''<table><tr><td>Hello</td></tr></table>'''
    # test for innerhtml=False
    component_f = XPath('//td', idname='foo', innerhtml=False)
    component_f.extract(html.fromstring(html_str))
    extracted = component_f.process()
    extracted_str = html.tostring(extracted)
    expected = '<div class="mwu-elem" id="foo"><td>Hello</td></div>'
    e = normxml(expected)
    a = normxml(extracted_str)
    self.assertSequenceEqual(e, a)
    # test for innerhtml=True
    component_t = XPath('//td', idname='foo', innerhtml=True)
    component_t.extract(html.fromstring(html_str))
    extracted = component_t.process()
    extracted_str = html.tostring(extracted)
    expected = '<div class="mwu-elem" id="foo">Hello</div>'
    self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
    # test for ineffectiveness of innerhtml=True with multiple matching elements
    component_t = XPath('//td', idname='foo', innerhtml=True)
    component_t.extract(html.fromstring('''
<table><tr>
<td>Hello</td>
<td>Goodbye</td>
</tr></table>
'''))
    extracted = component_t.process()
    extracted_str = html.tostring(extracted)
    expected = '<div class="mwu-elem" id="foo"><td>Hello</td><td>Goodbye</td></div>'
    self.assertSequenceEqual(normxml(expected), normxml(extracted_str))
def business_premises(self, session):
    s = session
    html = fromstring(s.get(self.business_premises_url()).content)
    premises = html.xpath('//table[@width="740" and @cellpadding="5"]')
    if len(html.xpath('//font[text()="No Business \n Premises Found."]')) == 1:
        data = []
    else:
        assert 1 == len(premises), tostring(html)
        trs = premises[0].cssselect('tr')
        datalists = [[td.text_content() for td in tr.cssselect('td')] for tr in trs]
        header = [key.replace(' ', '') for key in datalists.pop(0)]
        data = [dict(zip(header, row)) for row in datalists]
    for row in data:
        row.update({
            'date_scraped': DATE,
            'businessPremisesURL': self.business_premises_url()
        })
    registrant_data = {}
    for bodybold in html.cssselect('span.bodybold'):
        text = bodybold.xpath('following-sibling::span[@class="Text"][position()=1]/text()')
        assert len(text) == 1, tostring(html)
        registrant_data['bp_' + bodybold.text.replace(' ', '').replace(':', '')] = text[0].strip()
    return data, registrant_data
def parse_book_file(href, book):
    block = {}
    book_tree = lxml.html.parse(join(books_dir, href), parser)
    if 'page_count' not in book:
        td = book_tree.xpath(
            "//td[descendant::*[contains(text(), '{}')]]".format(book['title'])
        )
        if len(td):
            td = td[0]
            page_info = td.xpath("descendant::*[contains(text(), 'страниц')]")
            if len(page_info):
                book['page_count'] = patterns[0][1].search(
                    tostring(page_info[0], encoding='unicode')).groups()[0]
    block['annotation'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Аннотация')]]")
    block['contents'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Содержание')]]")
    for key in block:
        if len(block[key]):
            mark = block[key][-1]
            book[key] = ""
            for element in mark.itersiblings():
                if element.tag == "table":
                    break
                drop_a(element)
                remove_attr(element)
                book[key] += tostring(element, encoding='unicode')
            book[key] = tidy_fragment(clean(book[key]))[0]
    return book
def WestBradfordGrill(clas):
    resp = l2.urlopen(clas.url)
    data = resp.read()
    #tappath = '//div[@id="sidebar-left-1"]/div[@id="TextList1"]//li'
    #doc = lxml.html.fromstring(data)
    a = lxml.html.fromstring(data)
    print tostring(a)
def download_user_review(url):
    try:
        f = get_url(url)
        page = html.parse(f)
        root = page.getroot()
        if len(root.cssselect("div.error404")) > 0:
            #print url + " 404'ed"
            return {}
        meta = html.tostring(root.cssselect("#player_review div.body div.user_reviews")[0])
        #@TODO parse meta
        if len(root.cssselect("#player_score_details div.body dl.review_details")) > 0:
            score_details = html.tostring(root.cssselect("#player_score_details div.body dl.review_details")[0])
        else:
            score_details = "No Details"
        body = html.tostring(root.cssselect("#player_review_body")[0])
        ret = {}
        ret['meta'] = meta
        ret['score_details'] = score_details
        ret['body'] = body
        #@TODO parse body
        ret['url'] = url
        return ret
        #ipdb.set_trace()
    except:
        traceback.print_exc()
        gmail.send("exception!", "*****@*****.**")
        ipdb.set_trace()
def buildStatesFromTokenizerElement(state_name, html_snippet, sm):
    print __b(state_name), "in process"
    if state_name == "tokenizing-character-references":
        return
    dom = lhtml.fromstring(html_snippet)
    switch = dom.cssselect("dl")
    if not switch:
        raise Exception("%s does not have <dl> (switch)" % state_name)
    if len(switch) > 1:
        print __em("%s have too many <dl> (switch)" % state_name)
    switch = switch[0]
    transitions = []
    for elmt in switch:
        if elmt.tag not in ("dt", "dd"):
            continue
        if elmt.tag == "dt":
            dt_text = elmt.text
            if not dt_text:
                dt_text = lhtml.tostring(elmt)
            transitions.append(dt_text)
        elif elmt.tag == "dd":
            # We consume the transitions to jump into the
            # specified state
            buildSwitchTransitions(state_name, sm, switch=transitions, to=lhtml.tostring(elmt))
            transitions = []
def get_meal_info2():
    url = "https://zerocater.com/menu/uVGcXhj/"
    tree = html.fromstring(urllib.urlopen(url).read())
    meals = tree.xpath('//div[@class="inherit-height meal-item"]')
    i = 0
    for meal in meals:
        today = meal.xpath('.//span[@class="meal-is-today label"]/text()')
        if len(today) > 0:
            break
        date = meal.xpath('.//h4[@class="overview-time"]/text()')
        date_string = get_string(date)
        date_string = ' '.join(date_string.split())
        day = int(date_string[-2:].strip())
        print day
        print date_string
        today = datetime.utcnow() - timedelta(hours=7)
        print today.day
        if today.day <= day:
            break
        i = i + 1
    meal_today = meals[i]
    vendor = meal_today.xpath('.//div[@class="overview-wrapper"]')[0]
    menu = meal_today.xpath('.//ul[@class="list-group swiper-no-swiping"]')[0]
    data = {}
    data['overview'] = html.tostring(vendor)
    data['menu'] = html.tostring(menu)
    print data
    return data
def create_html():
    """Creates the html for the page."""
    table = create_html_table(DISCOVERY_LIST)
    root = LH.tostring(table)  #convert the generated HTML to a string
    comment = ET.Comment(TITLE)
    comment = ET.tostring(comment)
    script = ELEMENT.Script(SCRIPT, type="text/x-mathjax-config")
    script = LH.tostring(script)
    script2 = ELEMENT.Script(src=SCRIPT_URL, type="text/javascript")
    script2 = LH.tostring(script2)
    script3 = ELEMENT.Script(src=SORT_SCRIPT, type="text/javascript")
    script3 = LH.tostring(script3)
    email = ELEMENT.A(EMAIL, href=MAILTO)
    paragraph = ELEMENT.P(PARAGRAPH, email, ".")
    date_time = 'Updated ' + time.strftime('%Y-%m-%d %H:%M:%S')
    date_time = ELEMENT.I(date_time)
    paragraph.append(date_time)
    paragraph = LH.tostring(paragraph)
    paragraph = re.sub(r' \.', '.', paragraph)
    root = comment + script + script2 + script3 + paragraph + root
    soup = BS(root)  #make BeautifulSoup
    out = soup.prettify()  #prettify the html
    return out
def html_chenger(li):
    if type(li) in [str, unicode, int, float]:
        return li
    if type(li) is html.HtmlElement:
        li = html.tostring(li)
        return li
    if type(li) is list and len(li) == 1:
        li = li[0]
        if type(li) is html.HtmlElement:
            li = html.tostring(li)
        return li
    if li == list():
        return ''
    if type(li) is list:
        for i, el in enumerate(li):
            if type(el) in [str, unicode, int, float]:
                continue
            elif type(el) is html.HtmlElement:
                li[i] = html.tostring(el)
        return '; '.join(li)
    return None
def _get_image(doc):
    image_html = ''
    list = [l for l in extract_patterns.split('|')]
    """for i in list:
        p = i.split(':')
        pattern = Pattern(p[0], p[1])
        patterns.append(pattern)"""
    patterns = [Pattern(i.split(':')[0], i.split(':')[1]) for i in list]
    for p in patterns:
        try:
            if p.pattern_type == 'id':
                d = doc.get_element_by_id(p.pattern_value)
                image_html = html.tostring(d).strip()
                if len(image_html) > 0 and _image_count(image_html):
                    break
            elif p.pattern_type == 'class':
                d = doc.find_class(p.pattern_value)
                if d:
                    image_html = html.tostring(d[0])
                    if len(image_html) > 0 and _image_count(image_html) > 0:
                        break
        except Exception, ex:
            continue
def _parse(self, body):
    q = {}
    doc = HTML.fromstring(body)
    qbox = doc.xpath("//*[@id=\"question-box\"]")[0]
    qtitle = qbox.xpath(".//h1[@id='question-title']//span")[1]
    qbody = qbox.xpath(".//*[@id=\"question-content\"]")[0]
    # get question
    q["title"] = qtitle.text_content()
    self.title = q["title"]
    self.rkeys.append(self.title)
    q["body"] = qbody.text_content()
    anwsers = [None, None]  # 0 best anwser, 1 recommended anwser, 2-more other anwser
    '''get best anwser'''
    bae = doc.xpath("//*[@id='best-answer-panel']")
    if bae:
        ba = bae[0].xpath(".//*[@class='content']")[0]
        ba = HTML.tostring(ba, encoding="utf-8")
        anwsers[0] = ba
    '''get recommended anwser'''
    rae = doc.xpath("//*[@id='recommend-answer-panel']")
    if rae:
        ra = rae[0].xpath(".//*[@class='content']")[0]
        ra = HTML.tostring(ra, encoding="utf-8")
        anwsers[1] = ra
    '''get other anwsers'''
    oae = doc.xpath("//*[@id='reply-panel']")
    if oae:
        aes = oae[0].xpath(".//*[@class='content']")
        for aei in aes:
            anwsers.append(HTML.tostring(aei, encoding="utf-8"))
    q["anwsers"] = anwsers
    return q
def LoadFP():
    check = getURL(ROOT_URL, False)
    if check[1] != {None: None}:  # Needed Authentication ctTV-Main Page
        ctTV_Main = HTML.ElementFromURL(ROOT_URL, headers=check[1], cacheTime=0, encoding="Latin-1", errors="ignore")
    else:
        ctTV_Main = HTML.ElementFromURL(ROOT_URL, cacheTime=0, encoding="Latin-1", errors="ignore")
    # Read a string version of the page
    ctTV_MainString = cleanHTML(urllib2.urlopen(check[0]).read())
    # Get some MAIN Meta-Data of c't TV:
    mainTitle = ctTV_Main.xpath("/html/body/div[@id='navi_top']/div[1]/ul[1]/li[2]/a")[0]
    mainTitle = tostring(mainTitle).split('">')[1][:-4].replace('<span>', '').replace('</span>', '').encode('Latin-1').decode('utf-8')
    mainSubtitle = ctTV_Main.xpath("/html/body/div[@id='navi_top']/div[1]/ul[3]/li[4]/a")[0].text.encode('Latin-1').decode('utf-8')
    # Define current video
    currentVideoTitle1 = ctTV_Main.xpath("//*[@id='hauptbereich']/div[@id='video']/h1/text()")[0].encode('Latin-1').decode('utf-8')
    currentVideoTitle2 = ctTV_Main.xpath("//*[@id='hauptbereich']/div[@id='video']/h1")[0]
    currentVideoTitle2 = tostring(currentVideoTitle2).split('|')[1].split('<')[0].encode('Latin-1').decode('utf-8')
    currentVideoTitle = currentVideoTitle1 + '|' + currentVideoTitle2
    currentVideoURL = ROOT_URL
    themes = getThemes(ctTV_Main)
    topics = getTopics(ctTV_Main)
    archive = getArchive(ctTV_MainString)
    return (mainTitle, mainSubtitle, currentVideoTitle, currentVideoURL, themes, topics, archive)
def replace_terms(html):
    html = force_text(html)
    remove_body = False
    remove_p = False
    etree = parse(StringIO(html))
    root_node = etree.getroot()
    if not _looks_like_full_html_unicode(html):
        root_node = root_node.getchildren()[0]
        remove_body = True
        if root_node.getchildren()[0].tag == 'p' and html[:3] != '<p>':
            remove_p = True
    variants_dict = Term.objects.variants_dict()
    replace_dict = Term.objects.replace_dict()
    replace_regexp = Term.objects.replace_regexp()
    replace_regexp__sub = replace_regexp.sub
    translate = get_translate_function(replace_dict, variants_dict)
    for node in get_interesting_contents(root_node, replace_regexp):
        new_content = replace_regexp__sub(
            translate, tostring(node, encoding='unicode'))
        new_node = parse(StringIO(new_content)).getroot().getchildren()[0]
        if node.tag != 'body':
            new_node = new_node.getchildren()[0]
        node.getparent().replace(node, new_node)
    if remove_body:
        if remove_p:
            root_node = root_node.getchildren()[0]
        out = root_node.text or ''
        out += ''.join([tostring(node, encoding='unicode')
                        for node in root_node.getchildren()])
        return out
    return tostring(etree, encoding='unicode')
def myparser(reviewObj, element):
    populateReviewerInfo(reviewObj, element)
    # date
    tempList = element.cssselect('.review-meta .date')
    date = ''
    if (len(tempList) > 0):
        date = html.tostring(tempList[0], method='text', encoding=unicode).strip()
    reviewObj.setReviewDate(date)
    # comment
    tempList = element.cssselect('.externalReview .review_comment')
    comment = ''
    if (len(tempList) > 0):
        tempElement = html.fragment_fromstring(html.tostring(tempList[0]).replace('<br>', ' ').replace('<br/>', ' ').replace('<BR>', ' ').replace('<BR/>', ' '))
        comment = html.tostring(tempElement, method='text', encoding=unicode).strip()
    reviewObj.setReviewText(comment)
    # rating
    tempList = element.cssselect('.externalReview .review-meta .rating meta')
    rating = ''
    if (len(tempList) > 0):
        rating = tempList[0].get('content')
    reviewObj.setReviewRating(rating)
def copy_chapters_across_with_fixes(chapter_info, fixed_toc):
    comments_html = open('disqus_comments.html').read()
    buy_book_div = html.fromstring(open('buy_the_book_banner.html').read())
    analytics_div = html.fromstring(open('analytics.html').read())
    load_toc_script = open('load_toc.js').read()
    for chapter in CHAPTERS:
        old_contents = open(chapter).read()
        new_contents = fix_xrefs(old_contents, chapter, chapter_info)
        new_contents = fix_title(new_contents, chapter, chapter_info)
        parsed = html.fromstring(new_contents)
        body = parsed.cssselect('body')[0]
        if parsed.cssselect('#header'):
            head = parsed.cssselect('head')[0]
            head.append(html.fragment_fromstring('<script>' + load_toc_script + '</script>'))
            body.set('class', 'article toc2 toc-left')
        body.insert(0, buy_book_div)
        body.append(html.fromstring(
            comments_html.replace('CHAPTER_NAME', chapter.split('.')[0])
        ))
        body.append(analytics_div)
        fixed_contents = html.tostring(parsed)
        with open(DEST / chapter, 'w') as f:
            f.write(fixed_contents.decode('utf8'))
    with open(DEST / 'toc.html', 'w') as f:
        f.write(html.tostring(fixed_toc).decode('utf8'))
def results(self, query, pages_max=1):
    for page in range(1, pages_max + 1):
        if page > 1:
            if not self._next(page):
                break
        else:
            self.browser.submit_form(self.url, fields={'q': query})
        for li in self.browser.cssselect('li.g', []):
            log = html.tostring(li, pretty_print=True)[:1000]
            links = li.cssselect('a')
            if not links:
                logger.error('failed to get links from %s', log)
                continue
            url = links[0].get('href')
            if not url or not urlparse(url).scheme:
                continue
            title = clean(self.get_link_text(html.tostring(links[0])))
            if not title:
                continue
            yield {
                'title': title,
                'url': url,
                'page': page,
            }
def expect_redirect(step, from_url, to_url):
    """Go to a url and expect a 302, check the DOM returned == expected DOM.

    Bit weird this one, and might not be useful. The :py:data:`to_url` you are
    expecting to eventually reach (via the :py:data:`from_url`) is first hit in
    the normal fashion and the DOM is saved to a string. The :py:data:`from_url`
    is then hit, checked for a 302 and the eventual DOM is compared to the
    stored one.

    If Selenium is used, the :py:data:`from_url` is hit, and waited for as per
    :py:func:`access_url`.
    """
    step.given('I access the url "%s"' % to_url)
    expected_dom_str = html.tostring(world.dom)
    response = world.browser.get(from_url)
    code = response.status_code
    assert code == 302, \
        "Failed to get a 302 for %s, got %s" % (from_url, code)
    response = world.browser.get(from_url, follow=True)
    world.dom = html.fromstring(response.content)
    world.templates = [t.name for t in response.template]
    assert html.tostring(world.dom) == expected_dom_str, \
        "Expected DOM doesn't match redirected DOM"
    if world.using_selenium:
        world.sel.open(from_url)
        world.sel.wait_for_page_to_load(world.timeout)
def word_def(word):
    word = word.lower()
    phrase = '-'.join(word.split())
    words = [word, phrase, '%s_1' % word, '%s_1' % phrase]
    cursor = db.words.find({'word': {'$in': words}})
    if cursor.count():
        w = cursor.next()
        return jsonify(word=word, content=w['content'], related=w['related'])
    try:
        word_define = urlopen('%s/dictionary/%s' % (app.config['URL'], word))
    except:
        abort(404)
    doc = etree.HTML(word_define.read())
    if '/spellcheck/?q' in word_define.url:
        content = polish(tostring(doc.xpath(
            "/html/body/div[@id='ox-container']"
            "/div[@id='ox-wrapper']/div[@id='main_column']")[0])).strip()
        related = '#'
    else:
        contentElem = doc.xpath(
            "/html/body/div[@id='ox-container']/div[@id='ox-wrapper']"
            "/div[@id='main_column']/div[@id='main-container']"
            "/div[@id='entryContent']")[0]
        content = polish(tostring(contentElem)).strip()
        related = polish(tostring(doc.xpath(
            "/html/body/div[@id='ox-container']/div[@id='leftcolumn']"
            "/div[@id='relatedentries']")[0])).strip()
        thread = DocParseThread(contentElem, content, related)
        thread.start()
        g.thread = thread
    return jsonify(word=word, content=content, related=related)
def get_forms(request):
    # take passed url, use regular expression to capture domain
    def get_store_name(url):
        store_name = re.search(r'http://www.(\w*).', url).group(1)
        return store_name

    if request.is_ajax():
        product_url = request.POST['product_url']
        store_name = get_store_name(product_url)
        form_list = []
        # setup the browser object
        b = mechanize.Browser()
        b.set_handle_robots(False)
        b.set_proxies({'http': 'api.crawlera.com'})
        b.add_proxy_password("jquintal", "we8GeegieR")
        # fetch page and open in lxml
        b_response = b.open(product_url)
        html = b_response.read()
        tree = lh.fromstring(html)
        # fetch forms
        if (store_name == "target"):
            forms = tree.cssselect('.order-item')
            for form in forms:
                form_list.append(lh.tostring(form))
        elif (store_name == "radioshack"):
            # GET RADIO SHACK FORMS
            # NOTE: RadioShack appears to have no forms other than quantity, meaning: they list
            # all product variants as separate entries EX: Beats Pill Blue, Beats Pill Red, etc.
            pass
        elif (store_name == "amazon"):
            # GET AMAZON FORMS
            pass
        elif (store_name == "toysrus"):
            forms = tree.cssselect('#buyInterior')
            for div in forms:
                form_list.append(lh.tostring(div))
        elif (store_name == "tigerdirect"):
            forms = tree.cssselect('.prodAction')
            for form in forms:
                form_list.append(lh.tostring(form))
        elif (store_name == "overstock"):
            forms = tree.cssselect('#addCartWrap_addCartMain')
            for form in forms:
                form_list.append(lh.tostring(form))
        elif (store_name == "newegg"):
            # NOT YET WORKING
            forms = tree.cssselect('.grpQty')
            for form in forms:
                form_list.append(lh.tostring(form))
        return render_to_response('cart/store_forms.html', {'forms': form_list})
def copy_chapters_across_fixing_xrefs(chapter_info, fixed_toc):
    comments_div = html.fromstring(open('disqus_comments.html').read())
    buy_book_div = html.fromstring(open('buy_the_book_banner.html').read())
    analytics_div = html.fromstring(open('analytics.html').read())
    load_toc_script = open('load_toc.js').read()
    for chapter in CHAPTERS:
        new_contents = fix_xrefs(chapter, chapter_info)
        parsed = html.fromstring(new_contents)
        body = parsed.cssselect('body')[0]
        if parsed.cssselect('#header'):
            head = parsed.cssselect('head')[0]
            head.append(html.fragment_fromstring('<script>' + load_toc_script + '</script>'))
            body.set('class', 'article toc2 toc-left')
        body.insert(0, buy_book_div)
        body.append(comments_div)
        body.append(analytics_div)
        fixed_contents = html.tostring(parsed)
        target = os.path.join('/home/harry/workspace/www.obeythetestinggoat.com/content/book', chapter)
        with open(target, 'w') as f:
            f.write(fixed_contents.decode('utf8'))
    toc = '/home/harry/workspace/www.obeythetestinggoat.com/content/book/toc.html'
    with open(toc, 'w') as f:
        f.write(html.tostring(fixed_toc).decode('utf8'))
def download_user_review(url):
    try:
        if url.find("http://www.gamespot.com") == -1:
            url = "http://www.gamespot.com" + url
        f = urllib.urlopen(url)
    except:
        traceback.print_exc()
        ipdb.set_trace()
    try:
        page = html.parse(f)
        root = page.getroot()
        meta = html.tostring(root.cssselect("#player_review div.body div.user_reviews")[0])
        if len(root.cssselect("#player_score_details div.body dl.review_details")) > 0:
            score_details = html.tostring(root.cssselect("#player_score_details div.body dl.review_details")[0])
        else:
            score_details = "No Details"
        body = html.tostring(root.cssselect("#player_review_body")[0])
        ret = {}
        ret['meta'] = meta
        ret['score_details'] = score_details
        ret['body'] = body
        return ret
        #ipdb.set_trace()
    except:
        traceback.print_exc()
        ipdb.set_trace()
def any_html_to_string(self, cls, value):
    return html.tostring(value)
with open('./rebuild.sh') as f:
    lines = f.readlines()[2:]
t = [x.strip('# \n') for x in lines[-12::2]]
dic = [(x.split('|')[0].decode('utf-8'), x.split('|')[1]) for x in t]
dic = dic[::-1]

with open('./html/index.html') as f:
    root = fromstring(f.read())
ul = root.xpath('//ul[@id="toc"]')[0]
lis = ul.xpath('./li')

# remove the existing <li> entries
for li in lis:
    li.getparent().remove(li)

# build the new <li> entries
for name, date in dic:
    li = etree.Element('li')
    a = etree.Element('a', href='./' + name + '.html')
    a.text = name
    span = etree.Element('span')
    span.set("class", 'time')
    span.text = date
    a.append(span)
    li.append(a)
    ul.append(li)

with open('./html/index.html', 'wb') as f:
    f.write(tostring(root, encoding='unicode').encode('utf-8'))
def extract_from_html(msg_body): """ Extract not quoted message from provided html message body using tags and plain text algorithm. Cut out the 'blockquote', 'gmail_quote' tags. Cut Microsoft quotations. Then use plain text algorithm to cut out splitter or leftover quotation. This works by adding checkpoint text to all html tags, then converting html to text, then extracting quotations from text, then checking deleted checkpoints, then deleting necessary tags. """ if msg_body.strip() == '': return msg_body # Fix bad HTML caused by weird encoding issues if msg_body.count('=3D') > 2: # it's unlikely this was intentional msg_body = msg_body.replace('=3D', '=') # also get rid of trailing equals; in doing so, strip newlines # as there may have been spurious ones inserted in the middle of tags msg_body = msg_body.replace('=\n', '') try: html_tree = html.document_fromstring( msg_body, parser=html.HTMLParser(encoding="utf-8") ) except etree.ParserError: # Malformed HTML, don't try to strip. return msg_body cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or html_quotations.cut_blockquote(html_tree) or html_quotations.cut_microsoft_quote(html_tree) or html_quotations.cut_by_id(html_tree) or html_quotations.cut_from_block(html_tree) ) html_tree_copy = deepcopy(html_tree) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) quotation_checkpoints = [False for i in xrange(number_of_checkpoints)] msg_with_checkpoints = html.tostring(html_tree) h = html2text.HTML2Text() h.body_width = 0 # generate plain text without wrap # html2text adds unnecessary star symbols. Remove them. # Mask star symbols msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432') plain_text = h.handle(msg_with_checkpoints) # Remove created star symbols plain_text = plain_text.replace('*', '') # Unmask saved star symbols plain_text = plain_text.replace('3423oorkg432', '*') delimiter = get_delimiter(plain_text) plain_text = preprocess(plain_text, delimiter, content_type='text/html') lines = plain_text.splitlines() # Don't process too long messages if len(lines) > MAX_LINES_COUNT: return msg_body # Collect checkpoints on each line line_checkpoints = [ [int(i[4:-4]) # Only checkpoint number for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)] for line in lines] # Remove checkpoints lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line) for line in lines] # Use plain text quotation extracting algorithm markers = mark_message_lines(lines) return_flags = [] process_marked_lines(lines, markers, return_flags) lines_were_deleted, first_deleted, last_deleted = return_flags if lines_were_deleted: #collect checkpoints from deleted lines for i in xrange(first_deleted, last_deleted): for checkpoint in line_checkpoints[i]: quotation_checkpoints[checkpoint] = True else: if cut_quotations: return html.tostring(html_tree_copy) else: return msg_body # Remove tags with quotation checkpoints html_quotations.delete_quotation_tags( html_tree_copy, 0, quotation_checkpoints ) return html.tostring(html_tree_copy)
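# The docstring above describes the checkpoint trick: tag every element with a
# marker, flatten the HTML to text, see which markers fall inside the detected
# quotation, then delete exactly those elements from a clean copy of the tree.
# A toy illustration of just the marking step, assuming nothing about the
# library's internals (the '#cpN#' marker format here is invented):
from lxml import html

def add_checkpoints(tree):
    """Append a recognizable marker to each element's text so the element can
    be located again after the tree has been flattened to plain text."""
    for i, el in enumerate(tree.iter()):
        el.text = (el.text or '') + '#cp{}#'.format(i)
    return tree

tree = html.fromstring('<div>reply<blockquote>quoted</blockquote></div>')
add_checkpoints(tree)
# html.tostring(tree) now contains '#cp0#', '#cp1#', ... markers; after the
# text-level quotation detection, the markers that landed inside the quoted
# region identify which elements to drop.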
new_page = requests.get(notebook_page)
current_status = new_page.status_code
if current_status != 404:
    # Get image and text if any
    new_tree = html.fromstring(new_page.content)
    if new_tree.xpath('count(//textarea)') != 0:
        page_text = new_tree.xpath('//textarea/text()')
        with open(new_path + 'data/' + page_number + ".txt", "w") as d:
            d.write(page_text[0].encode("utf-8"))
            d.close()
    if new_tree.xpath('count(//table/tr/th[2])') != 0:
        page_image = html.tostring(new_tree.xpath('//table/tr/th[2]')[0])
        image_text = page_image[page_image.find("src=") + 5:page_image.find("jpg", page_image.find("src=")) + 3]
        r = requests.get(root_page + notebook_links[count] + image_text)
        i = Image.open(StringIO(r.content))
        i.save(new_path + 'images/' + image_text)
count += 1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'

from lxml import html

root = html.fromstring('<p>Hello<br>world!</p><br>')

print(html.tostring(root))
# b'<div><p>Hello<br>world!</p><br></div>'

print(html.tostring(root, encoding='unicode'))
# <div><p>Hello<br>world!</p><br></div>

print(html.tostring(root, pretty_print=True))
# b'<div>\n<p>Hello<br>world!</p>\n<br>\n</div>\n'

print(html.tostring(root, encoding='unicode', pretty_print=True))
# <div>
# <p>Hello<br>world!</p>
# <br>
# </div>
def inner_html(node):
    """ get original html from a node """
    return (node.text or '') + ''.join(
        [html.tostring(child) for child in node.iterchildren()])
def get_html_from_element(element):
    return tostring(element)
def semanticize(doc_path='test.html'): """ P: unbroken set of lines (.t divs) of the same look make one <p> H1-3: Top 3 kinds of font size are turned to h1, h2 and h3. TABLE: use x and y position to indicate <td>, TODO: colspan support """ print(doc_path) dom, dimensions = prepare(doc_path) get_dimension = lambda el, dim_type: dimensions[dim_type].get( classN(dim_type, el)) or 0 # recover text from embedded fonts with bad CMAPS if > 50% of characters are unicode PUA recover = pua_content(dom.text_content()) > 0.5 if recover: print('Recovery needed, not now.') return recover_text(dom, os.path.dirname(doc_path)) # remove paging headers if REMOVE_HEADERS: dom = remove_headers(dom) # remove javascript holders for div in dom.cssselect('.j'): remove(div) if TABLES: table_data = grid_data(dom, get_dimension) dom = reconstruct_tables(dom, table_data) h_levels = heading_levels(dom, dimensions) # line by line analysis and conversion p_look = p_height = p_space = p_tag = box = 0 for l in dom.cssselect('.t'): # Gather information about this line to see if it's part of a block. # 1. detect change of look - different css classes from previous line look = ' '.join([ c for c in l.attrib['class'].split() if c[0] != 'y' and c[0:2] != 'fc' ]) # ignore y pos and font color new_look = p_look != look # 2. detect change of margin height - larger difference in bottom position from previous line height = get_dimension(l, 'h') line_height = p_height - height margin = line_height > MAX_LINE_HEIGHT # 3. space above - preceding empty line space = not l.text_content().strip() # Based on collected info: does this line belong to previous line? append = new_look == p_space == margin == False txt = l.text_content() tag = 'p' # LI indent = 'x0' not in look # there is some indentation if [1 for b in BULLETS if txt.startswith(b)]: tag = 'li' append = 0 elif indent and p_tag == 'li': tag = 'li' append = 1 # H1, H2... size = classN('fs', l) if size in h_levels.keys(): append = 0 tag = 'h%s' % h_levels[size] # merge multiline-elements if txt.strip(): if append: if BR: box.append(Element('br')) box.append(l) else: box = l l.tag = tag else: remove(l) if DEBUG: mark = ('<%s>' % tag).ljust(5) if append: mark = 5 * ' ' print(' Aa %d ⇪ %d ⇕ % 3d %s %s %s' %\ (new_look, p_space, line_height, l.attrib['class'].ljust(40), mark, txt)) # save current values for comparison in the next loop iteration p_space, p_height, p_look, p_tag = space, height, look, tag wrap_set(dom, 'li', 'ul') if STRIP_CSS: for e in dom.cssselect("style"): remove(e) for attr in 'style id class data-page-no data-data'.split(): for e in dom.cssselect("*"): try: del e.attrib[attr] except KeyError: pass # save file html = tostring(dom, encoding=ENCODING, pretty_print=True).decode(ENCODING) s = '<!DOCTYPE html>' + html for a, b in REPLACE_AFTER: s = re.sub(a, b, s) for rm in REMOVE_AFTER: s = re.sub(rm, '', s) for b in BULLETS: s = s.replace(b, '') if recover: for rm in REMOVE_BEFORE: s = re.sub(rm, '', s) # New file is .htm, not .html save_path = doc_path[:-1] f = open(save_path, 'w', encoding=ENCODING) f.write(s) f.close()
        pre = content.makeelement('pre', {'class': 'converted-comment'})
        pre.text = c.text
        c.getparent().replace(c, pre)
    else:
        logger.warn('Removing comment')
        c.getparent().remove(c)

# Convert style="text-align: right" to class
for tag in content.xpath("//*[starts-with(@style, 'text-align: right')]"):
    logger.debug('Converting "text-align: right" to class')
    del tag.attrib['style']
    tag.attrib['class'] = 'text-right'

# Convert style="text-align: center" to class
for tag in content.xpath("//*[starts-with(@style, 'text-align: center')]"):
    logger.debug('Converting "text-align: center" to class')
    del tag.attrib['style']
    tag.attrib['class'] = 'text-center'

# Check for missed style attributes
for tag in content.xpath("//*[@style]"):
    logger.warn('Found remaining style attribute')
    sys.exit('Giving up')

chapter = Chapter()
chapter.html = HTML(html.tostring(content), encoding='utf-8')
chapter.title = page['title']
book.sections.append(chapter)

book.make(book.title + '.epub')
def examine_meta(tree): '''Search meta tags for relevant information''' metadata = dict.fromkeys(METADATA_LIST) # bootstrap from potential OpenGraph tags title, author, url, description, site_name = extract_opengraph(tree) # test if all return values have been assigned if all((title, author, url, description, site_name)): # if they are all defined metadata['title'], metadata['author'], metadata['url'], metadata[ 'description'], metadata[ 'sitename'] = title, author, url, description, site_name return metadata tags = [] # skim through meta tags for elem in tree.iterfind('.//head/meta[@content]'): # content if not elem.get('content'): continue content_attr = elem.get('content') # image info # ... # property if 'property' in elem.attrib: # no opengraph a second time if elem.get('property').startswith('og:'): continue if elem.get('property') == 'article:tag': tags.append(content_attr) elif elem.get('property') in ('author', 'article:author'): if author is None: author = content_attr # name attribute elif 'name' in elem.attrib: name_attr = elem.get('name').lower() # author if name_attr in ('author', 'byl', 'dc.creator', 'dcterms.creator', 'sailthru.author'): # twitter:creator if author is None: author = content_attr # title elif name_attr in ('title', 'dc.title', 'dcterms.title', 'fb_title', 'sailthru.title', 'twitter:title'): if title is None: title = content_attr # description elif name_attr in ('description', 'dc.description', 'dcterms.description', 'dc:description', 'sailthru.description', 'twitter:description'): if description is None: description = content_attr # site name elif name_attr in ('publisher', 'dc.publisher', 'dcterms.publisher', 'twitter:site', 'application-name' ) or 'twitter:app:name' in elem.get('name'): if site_name is None: site_name = content_attr # url elif name_attr == 'twitter:url': if url is None and validate_url(content_attr)[0] is True: url = content_attr # keywords elif name_attr == 'keywords': # 'page-topic' tags.append(content_attr) elif 'itemprop' in elem.attrib: if elem.get('itemprop') == 'author': if author is None: author = content_attr elif elem.get('itemprop') == 'description': if description is None: description = content_attr elif elem.get('itemprop') == 'headline': if title is None: title = content_attr # to verify: #elif elem.get('itemprop') == 'name': # if title is None: # title = elem.get('content') # other types else: if not 'charset' in elem.attrib and not 'http-equiv' in elem.attrib and not 'property' in elem.attrib: LOGGER.debug( html.tostring(elem, pretty_print=False, encoding='unicode').strip()) metadata['title'], metadata['author'], metadata['url'], metadata[ 'description'], metadata['sitename'], metadata[ 'tags'] = title, author, url, description, site_name, tags return metadata
import scraperwiki
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
from lxml import html
import time

mech = Browser()
url = "http://www.bnm.gov.my/index.php?ch=12&pg=852"
page = mech.open(url)
html1 = page.read()
tree = html.fromstring(html1)
table, = tree.xpath('//*[.="Tenure"]/ancestor::table[1]')
soup1 = BeautifulSoup(html.tostring(table))
table = soup1.find("table")
now = time.time()
for row in table.findAll('tr')[1:]:
    col = row.findAll('td')
    data = {
        'time': now,
        'Tenure': col[0].string,
        'Buying': col[1].string,
        'Selling': col[2].string
    }
    scraperwiki.sqlite.save(unique_keys=['time'], data=data)
    now = now + 1
def sanitize_html(html):
    html = html5parser.fragment_fromstring(html, create_parent="div")
    html = cleaner.clean_html(tostring(html)).decode()
    return html
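# The module-level `cleaner` used above is not shown; a minimal sketch of how
# such a sanitizer could be wired up with lxml's Cleaner (an assumption -- the
# original may configure it differently). Note that in recent lxml releases the
# clean module lives in the separate lxml_html_clean package.
from lxml.html import html5parser, tostring
from lxml.html.clean import Cleaner

cleaner = Cleaner(scripts=True, javascript=True, style=True)

def sanitize_html_sketch(markup):
    fragment = html5parser.fragment_fromstring(markup, create_parent="div")
    return cleaner.clean_html(tostring(fragment)).decode()

# sanitize_html_sketch('<div onclick="evil()">hi<script>x()</script></div>')
# strips the <script> element, the onclick handler and any <style> content.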
def show_tip_filter(qa_html, qa, dummy_fields, dummy_model, dummy_data, dummy_col): """ Filter the answers to add the kanji diagram pop-ups. """ if not question_tips and not qa == 'a': return qa_html global do_show global current_script do_show = False current_script = show_tips_script try: doc = html.fromstring(qa_html) except: return qa_html elements = [] for ts in tip_selectors: elements += doc.cssselect(ts) elements = uniqify_list(elements) for el in elements: skip_elements = [] for skip_sel in skip_selectors: skip_elements += el.cssselect(skip_sel) skip_elements = uniqify_list(skip_elements) for sub_el in el.iter(): if sub_el in skip_elements: continue if sub_el.text is not None: bad_chars = media_characters(sub_el.text) new_index = 0 new_element = None tip_text = u'' sub_e_t = sub_el.text for i, g in enumerate(sub_e_t): if i in bad_chars: tip_text += g continue ge = maybe_make_tip(g) if ge is not None: do_show = True if new_element is None: sub_el.text = tip_text else: # new_element is the old new element... new_element.tail = tip_text sub_el.insert(new_index, ge) new_index += 1 new_element = ge tip_text = u'' else: tip_text += g if new_element is not None: new_element.tail = tip_text if sub_el is not el and sub_el.tail is not None: # We have to skip the tail of the element that # trigered the selector. That is *not* in the # selector. bad_chars = media_characters(sub_el.tail) parent = sub_el.getparent() new_index = parent.index(sub_el) + 1 new_element = None tip_tail = u'' sub_e_t = sub_el.tail for i, g in enumerate(sub_e_t): if i in bad_chars: tip_tail += g continue ge = maybe_make_tip(g) if ge is not None: do_show = True if new_element is None: sub_el.tail = tip_tail else: new_element.tail = tip_tail # We have to inser this into the parent, not # into this sub_el. parent.insert(new_index, ge) new_index += 1 new_element = ge tip_tail = u'' else: tip_tail += g if new_element is not None: new_element.tail = tip_tail if do_show: head = doc[1] jqui_style = html.Element('link') jqui_style.set('type', 'text/css') jqui_style.set('rel', 'stylesheet') jqui_style.set('href', jqui_style_path) jqui_style.tail = '\n' head.append(jqui_style) jqui_theme_style = html.Element('link') jqui_theme_style.set('type', 'text/css') jqui_theme_style.set('rel', 'stylesheet') jqui_theme_style.set('href', jqui_theme_style_path) jqui_theme_style.tail = '\n' head.append(jqui_theme_style) tt_style = html.Element('link') tt_style.set('type', 'text/css') tt_style.set('rel', 'stylesheet') tt_style.set('href', tips_style_path) tt_style.tail = '\n' head.append(tt_style) return unicode(urllib.unquote(html.tostring(doc, encoding='utf-8')), 'utf-8')
def getSoup(self, link):
    start = requests.get(link)
    tree = html.fromstring(start.text)
    soup = BeautifulSoup(html.tostring(tree))
    return soup
def post2rss(post, digest=False, pic=None, extra_types=()):
    """
    :param post (dict): post data
    :param digest (bool): output a digest only
    :param pic (str): pic=cf or pic=google selects the image proxy provider
    :param extra_types (tuple): post types other than answers and articles
    :return: PyRSS2Gen.RSSItem: post RSS item
    """
    if post['type'] == 'answer':
        title = '[回答] %s' % post['question']['title']
        url = 'https://www.zhihu.com/question/%s/answer/%s' % (
            post['question']['id'], post['id'])
        t_c = post['created_time']
        author = post['author']['name']
    elif post['type'] == 'article':
        title = '[文章] %s' % post['title']
        url = 'https://zhuanlan.zhihu.com/p/%s' % post['id']
        t_c = post['created']
        author = post['author']['name']
    elif post['type'] == 'pin':
        title = '[想法] %s' % post['excerpt_title']
        url = 'https://www.zhihu.com/pin/%s' % post['id']
        t_c = post['created']
        author = post['author']['name']
    elif 'question' in extra_types and post['type'] == 'question':
        title = '[问题] %s' % post['title']
        url = 'https://www.zhihu.com/question/%s' % (post['id'])
        t_c = post['created']
        author = None
    elif post['type'] == 'ANSWER_VOTE_UP':
        title = '[赞同了回答] %s by %s' % (post['question']['title'],
                                     post['author']['name'])
        url = 'https://www.zhihu.com/question/%s/answer/%s' % (
            post['question']['id'], post['id'])
        t_c = post['vote_up_time']
        author = post['author']['name']
    elif post['type'] == 'MEMBER_VOTEUP_ARTICLE':
        title = '[赞同了文章] %s by %s' % (post['title'], post['author']['name'])
        url = 'https://zhuanlan.zhihu.com/p/%s' % post['id']
        t_c = post['vote_up_time']
        author = post['author']['name']
    elif post['type'] == 'QUESTION_ANSWER':
        title = '%s 的回答' % post['author']['name']
        url = 'https://www.zhihu.com/question/%s/answer/%s' % (
            post['question']['id'], post['id'])
        t_c = post['created_time']
        author = post['author']['name']
    elif post['type'] == 'MEMBER_COLLECT_ANSWER':
        title = '[收藏了回答] %s by %s' % (post['question']['title'],
                                     post['author']['name'])
        url = 'https://www.zhihu.com/question/%s/answer/%s' % (
            post['question']['id'], post['id'])
        t_c = post['created_time']
        author = post['author']['name']
    elif post['type'] == 'MEMBER_COLLECT_ARTICLE':
        title = '[收藏了文章] %s by %s' % (post['title'], post['author']['name'])
        url = 'https://zhuanlan.zhihu.com/p/%s' % post['id']
        t_c = post['created']
        author = post['author']['name']
    elif post['type'] in ['roundtable', 'live', 'column']:
        return
    else:
        logger.warn('unknown type: %s', post['type'])
        return

    if post['type'] == 'pin':
        content = pin_content(post)
    else:
        content = post_content(post, digest)

    if post['type'] == 'ANSWER_VOTE_UP':
        content += "<p>回答发布于 %s </p>" % (datetime.datetime.utcfromtimestamp(
            post['created_time']).strftime('%Y-%m-%d %H:%M:%S'))
        content += "<p>回答编辑于 %s </p>" % (datetime.datetime.utcfromtimestamp(
            post['updated_time']).strftime('%Y-%m-%d %H:%M:%S'))
    elif post['type'] == 'MEMBER_VOTEUP_ARTICLE':
        content += "<p>文章发布于 %s </p>" % (datetime.datetime.utcfromtimestamp(
            post['created']).strftime('%Y-%m-%d %H:%M:%S'))
        content += "<p>文章编辑于 %s </p>" % (datetime.datetime.utcfromtimestamp(
            post['updated']).strftime('%Y-%m-%d %H:%M:%S'))
    else:
        pass

    content = content.replace('<code ', '<pre><code ')
    content = content.replace('</code>', '</code></pre>')

    # Post only contains images but no text
    if not content:
        content = '<img src="%s">' % post.get('thumbnail')

    doc = fromstring(content)
    tidy_content(doc)
    if pic:
        base.proxify_pic(doc, re_zhihu_img, pic)
    content = tostring(doc, encoding=str)

    pub_date = datetime.datetime.utcfromtimestamp(t_c)

    item = PyRSS2Gen.RSSItem(
        title=title.replace('\x08', ''),
        link=url,
        guid=url,
        description=content.replace('\x08', ''),
        pubDate=pub_date,
        author=author,
    )
    return item
def linkedin_companies_parser(url):
    for i in range(1):
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
            }
            print "Fetching :", url
            response = requests.get(url, headers=headers, verify=False)
            print response.content
            print response.status_code
            print response.headers
            print response.history
            formatted_response = response.content.replace('<!--', '').replace('-->', '')
            doc = html.fromstring(formatted_response)
            print html.tostring(doc)
            '''
            datafrom_xpath = doc.xpath('//code[@id="stream-promo-top-bar-embed-id-content"]//text()')
            content_about = doc.xpath('//code[@id="stream-about-section-embed-id-content"]')
            print('weird')
            if not content_about:
                content_about = doc.xpath('//code[@id="stream-footer-embed-id-content"]')
            if content_about:
                pass
                # json_text = content_about[0].html_content().replace('<code id="stream-footer-embed-id-content"><!--','').replace('<code id="stream-about-section-embed-id-content"><!--','').replace('--></code>','')
            if datafrom_xpath:
                try:
                    json_formatted_data = json.loads(datafrom_xpath[0])
                    company_name = json_formatted_data['companyName'] if 'companyName' in json_formatted_data.keys() else None
                    size = json_formatted_data['size'] if 'size' in json_formatted_data.keys() else None
                    industry = json_formatted_data['industry'] if 'industry' in json_formatted_data.keys() else None
                    description = json_formatted_data['description'] if 'description' in json_formatted_data.keys() else None
                    follower_count = json_formatted_data['followerCount'] if 'followerCount' in json_formatted_data.keys() else None
                    year_founded = json_formatted_data['yearFounded'] if 'yearFounded' in json_formatted_data.keys() else None
                    website = json_formatted_data['website'] if 'website' in json_formatted_data.keys() else None
                    type = json_formatted_data['companyType'] if 'companyType' in json_formatted_data.keys() else None
                    specialities = json_formatted_data['specialties'] if 'specialties' in json_formatted_data.keys() else None
                    if "headquarters" in json_formatted_data.keys():
                        city = json_formatted_data["headquarters"]['city'] if 'city' in json_formatted_data["headquarters"].keys() else None
                        country = json_formatted_data["headquarters"]['country'] if 'country' in json_formatted_data['headquarters'].keys() else None
                        state = json_formatted_data["headquarters"]['state'] if 'state' in json_formatted_data['headquarters'].keys() else None
                        street1 = json_formatted_data["headquarters"]['street1'] if 'street1' in json_formatted_data['headquarters'].keys() else None
                        street2 = json_formatted_data["headquarters"]['street2'] if 'street2' in json_formatted_data['headquarters'].keys() else None
                        zip = json_formatted_data["headquarters"]['zip'] if 'zip' in json_formatted_data['headquarters'].keys() else None
                        street = street1 + ', ' + street2
                    else:
                        city = None
                        country = None
                        state = None
                        street1 = None
                        street2 = None
                        street = None
                        zip = None
                    data = {
                        'company_name': company_name,
                        'size': size,
                        'industry': industry,
                        'description': description,
                        'follower_count': follower_count,
                        'founded': year_founded,
                        'website': website,
                        'type': type,
                        'specialities': specialities,
                        'city': city,
                        'country': country,
                        'state': state,
                        'street': street,
                        'zip': zip,
                        'url': url
                    }
                    return data
                except:
                    print "cant parse page", url
            '''
            # Retry in case of captcha or login page redirection
            if len(response.content) < 2000 or "trk=login_reg_redirect" in url:
                if response.status_code == 404:
                    print("linkedin page not found")
                else:
                    raise ValueError('redirecting to login page or captcha found')
        except Exception as e:
            print str(e)
            print "retrying :", url
def lxmlTable2Pandas3(*args, **kwargs):
    kwargs['Data'] = pd.read_html(lh.tostring(kwargs['Data']))[0]
    print(kwargs['Data'])
    return kwargs
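# Usage sketch for the helper above: pandas can parse a single lxml <table>
# element once it is serialized back to markup. The table content is invented;
# wrapping in StringIO avoids the literal-string deprecation in newer pandas.
import io
import pandas as pd
from lxml import html as lh

table_el = lh.fromstring('<table><tr><th>Tenure</th><th>Rate</th></tr>'
                         '<tr><td>1-month</td><td>3.05</td></tr></table>')
df = pd.read_html(io.StringIO(lh.tostring(table_el, encoding='unicode')))[0]
# df.columns -> roughly Index(['Tenure', 'Rate']); df.iloc[0, 0] -> '1-month'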
def get_body(self):
    body_ele = self.tree.xpath("//div[contains(@id,'qnaContainer-')]")
    if not body_ele:  # xpath returns a list, never None
        return None
    body_ele = body_ele[0]
    return html.tostring(body_ele, pretty_print=True).decode()
def innerHTML(el):
    if el is None:
        return ''
    return (el.text or '') + ''.join(
        html.tostring(x, encoding="unicode") for x in el)
def outerHTML(el):
    if el is None:
        return ''
    return html.tostring(el, with_tail=False, encoding="unicode")
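# Quick usage sketch contrasting the two helpers above on a small fragment:
from lxml import html

el = html.fromstring('<div id="x">Hi <b>there</b></div>')
# innerHTML(el) -> 'Hi <b>there</b>'                      (text plus serialized children, no wrapper tag)
# outerHTML(el) -> '<div id="x">Hi <b>there</b></div>'    (the element itself, tail text excluded)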
def any_html_to_unicode(self, cls, value, **_):
    return html.tostring(value, encoding='unicode')
def get_articles(self): headers = { 'Pragma': 'no-cache', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Referer': 'http://www.nlpjob.com/', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache', } # for page in range(self.page, self.max_page+1): for page in range(self.page, 30): # TODO check if next page exist print('\ngo to page:', page) with open('.progress.json', 'w+') as f: json.dump({'page': page}, f) params = {'p': page} try: a = time.time() resp = requests.get(self.base_url, headers=headers, params=params) b = time.time() print(b - a) except Exception as e: print(e) sys.exit(1) else: if resp.status_code != 200: print('code != 200') sys.exit(1) if '全部职位' not in resp.text: print('not in the right page') print('current page:', resp.url) sys.exit(1) tree = html.fromstring(resp.text) articles = tree.xpath('//div[contains(@class, "row")]') print('count:', len(articles)) for article in articles: article = html.fromstring(html.tostring(article)) publish_time = article.xpath('//span[@class="time-posted"]' )[0].text_content().strip() if '2017-06-27' in publish_time: sys.exit(1) href = article.xpath('//span[@class="row-info"]/a/@href') title = title = article.xpath( '//span[@class="row-info"]/a/text()') if href and title: href = href[0] title = title[0].strip() else: break id = href.split('/')[4] article_json = { 'id': id, 'href': href, 'title': title, 'publishTime': publish_time, 'status': 'not_done' } # pprint(article_json) if not self.col.find_one({'id': id}): self.col.insert_one(article_json) # break ### for debug return True
def reader(link, start, last, folder): #link = "https://www.wuxiaworld.com/novel/against-the-gods/atg-chapter-0" found = "" f = requests.get(link) page = html.fromstring(f.text) p = html.tostring(page).decode('utf-8') p = p.replace("“", '"') p = p.replace("…", "...") p = p.replace("”", '"') p = p.replace("’", "'") p = p.replace("–", "-") lines = p.splitlines() for i in range(len(lines)): if lines[i] == '<div class="fr-view">': found = lines[i + 1] break for i in range(len(lines)): if '/images/arrow-right.png' in lines[i]: nextchap = lines[i - 1] nextchap = re.search('"(.*)" class', nextchap).group(1) nextchap = "https://www.wuxiaworld.com" + nextchap break found = found.replace("</p><p>", "\n\n") found = found.replace("<p>", "") found = found.replace("</p>", "") found = found.replace("<strong>", "") found = found.replace("</strong>", "") name = found.splitlines()[0] if (("Chapter" not in name) or ("Previous" in name) or (len(name) > 45)): name = "Chapter " + str(start) name = name.translate(str.maketrans('', '', string.punctuation)) found = '\n'.join(found.splitlines()[1:]) found = '\n'.join(found.splitlines()[:-3]) file = open(folder + "/" + name + ".html", "w+") file.write( r'<style>p { font-family: Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif; font-size: 20px; font-style: normal; font-variant: normal; font-weight: 400; line-height: 25px; }</style>' ) file.write("<h2><strong><center>" + name + "</center></strong></h2>" + "<br>") file.write("<p>" + found.replace("\n\n", "</p><p>") + "</p>") file.close() options = { 'page-size': 'Executive', 'margin-top': '0.75in', 'margin-right': '0.75in', 'margin-bottom': '0.75in', 'margin-left': '0.75in', } path_wkthmltopdf = r'C:\\Program Files\\wkhtmltopdf\bin\\wkhtmltopdf.exe' config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf) pdfkit.from_file(folder + r'/' + name + ".html", folder + r'/' + name + ".pdf", options=options, configuration=config) os.remove(folder + "/" + name + ".html") if (start == last): return False else: print(nextchap) return reader(nextchap, start + 1, last, folder)
def any_html_to_bytes(self, cls, value, **_):
    return html.tostring(value)
def process_html(self, archive, context, text, target, options): # soup = fragment_fromstring(b'<article>' + text.encode('utf-8', 'replace') + b'</article>', create_parent=True) soup = fragment_fromstring(text, create_parent=True) escape = html.escape console = context[".console"] def write_error(insert_ref, el, msg, exc=None): log.error("insert '%s' failed; %s", insert_ref, msg) if context[".debug"]: if exc is not None: c = Console(text=True, width=120) c.obj(context, exc) _html = '<pre class="moya-insert-error"><code>{}</code></pre>'.format( escape(c.get_text())) else: _html = '<pre class="moya-insert-error"><code>{}</code></pre>'.format( escape(msg)) new_el = fromstring(_html) el.getparent().replace(el, new_el) else: el.getparent().remove(el) console.obj(context, exc) for el in self._selector(soup): try: insert_ref = el.attrib["insert"] except IndexError: write_error(el, "no 'insert' attribute in <moya> markup tag") app = None attribs = dict(el.attrib.items()) app_name = attribs.pop("app", None) or context.get( ".app.name", None) if app_name is None: write_error(insert_ref, el, "'app' attribute is required on <moya> tag") continue # Get data params params = {k.rsplit("-", 1)[-1]: v for k, v in attribs.items()} params.update(options) app = app or context.get(".app", None) if "#" in insert_ref: try: _app, insert_el = archive.get_element(insert_ref, app=app) except ElementNotFoundError as e: write_error( insert_ref, el, "markup insert element '{}' was not found".format( insert_ref), exc=e, ) continue else: from .tags.markup import MarkupInsert try: insert_el = MarkupInsert.registry[insert_ref] except KeyError: write_error( insert_ref, el, "markup insert element '{}' was not found".format( insert_ref), exc=e, ) continue _app = app if not getattr(insert_el, "_moya_markup_insert", False): msg = "{} is not safe for markup insertion".format( html.escape(insert_el)) write_error(insert_ref, el, msg) continue insert_callable = archive.get_callable_from_element(insert_el, app=_app) try: replace_markup = insert_callable(context, **params) except LogicError as e: write_error( insert_ref, el, "markup insert failed due to logic error, see logs", exc=e, ) continue except Exception as e: write_error(insert_ref, el, "markup insert failed, see logs", exc=e) continue new_el = fromstring(replace_markup) new_el.tail = el.tail el.getparent().replace(el, new_el) return HTML("".join(tostring(e).decode("utf-8") for e in soup))
def _upc(self):
    upc_list = re.search(r'upc : (\[[^\]]*\])', html.tostring(self.tree_html)).group(1)
    upc_list = ast.literal_eval(upc_list)
    return upc_list[0]
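# Small illustration of the pattern above: pull a JS-style list literal out of
# raw page markup with a regex, then parse it safely with ast.literal_eval.
# The markup string is invented; the original reads html.tostring(self.tree_html).
import ast
import re

page_src = "<script>var product = { upc : ['0123456789', '9876543210'] };</script>"
upc_list = ast.literal_eval(re.search(r"upc : (\[[^\]]*\])", page_src).group(1))
# upc_list[0] -> '0123456789'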
def main():
    document = html.document_fromstring(sys.stdin.read())
    for script in [s for s in document.getiterator('script')]:
        script.getparent().remove(script)
    print(html.tostring(document, encoding='unicode'))
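# Usage note for main() above (the script name is hypothetical): it reads HTML
# on stdin, drops every <script> element, and prints the rest, e.g.
#   python strip_scripts.py < page.html > page_noscript.html
# document_fromstring ensures a full <html>/<body> structure even for fragments,
# so the output is always a complete document.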
def send(self, varBody=None): msg = "send" if varBody: msg = "%s('%s')" % ( msg, str(varBody), ) log.ThugLogging.add_behavior_warn("[Microsoft XMLHTTP ActiveX] %s" % (msg, )) log.ThugLogging.add_behavior_warn( "[Microsoft XMLHTTP ActiveX] Fetching from URL %s (method: %s)" % ( self.bstrUrl, self.bstrMethod, )) log.ThugLogging.log_exploit_event(self._window.url, "Microsoft XMLHTTP ActiveX", "Send", forward=False, data={ "method": self.bstrMethod, "url": str(self.bstrUrl) }) response = None self.dispatchEvent("loadstart") try: response = self._window._navigator.fetch( self.bstrUrl, method=self.bstrMethod, headers=self.requestHeaders, body=varBody, redirect_type="Microsoft XMLHTTP") except Exception: log.ThugLogging.add_behavior_warn( '[Microsoft XMLHTTP ActiveX] Fetch failed') self.dispatchEvent("timeout") self.dispatchEvent("error") if response is None: return 0 self.status = response.status_code self.responseHeaders = response.headers self.responseBody = response.content self.responseText = response.text self.readyState = 4 if getattr(log, 'XMLHTTP', None) is None: log.XMLHTTP = dict() log.XMLHTTP['status'] = self.status log.XMLHTTP['responseHeaders'] = self.responseHeaders log.XMLHTTP['responseBody'] = self.responseBody log.XMLHTTP['responseText'] = self.responseText log.XMLHTTP['readyState'] = self.readyState last_bstrUrl = log.XMLHTTP.get('last_bstrUrl', None) last_bstrMethod = log.XMLHTTP.get('last_bstrMethod', None) if last_bstrUrl in (self.bstrUrl, ) and last_bstrMethod in ( self.bstrMethod, ): # pragma: no cover return 0 log.XMLHTTP['last_bstrUrl'] = str(self.bstrUrl) log.XMLHTTP['last_bstrMethod'] = str(self.bstrMethod) if self.mimeType: contenttype = self.mimeType else: contenttype = self.responseHeaders.get('content-type', None) if contenttype is None: # pragma: no cover return 0 self.dispatchEvent("load") self.dispatchEvent("readystatechange") if 'javascript' in contenttype: html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(response.text)))) doc = DOM.W3C.w3c.parseString(html) window = DOM.Window.Window(self.bstrUrl, doc, personality=log.ThugOpts.useragent) dft = DOM.DFT.DFT(window) dft.run() return 0 if 'text/html' in contenttype: tags = ('<html', '<body', '<head', '<script') if not any(tag in response.text.lower() for tag in tags): html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT( response.text)))) # pragma: no cover else: html = response.text doc = DOM.W3C.w3c.parseString(html) window = DOM.Window.Window(self.bstrUrl, doc, personality=log.ThugOpts.useragent) dft = DOM.DFT.DFT(window) dft.run() return 0 handler = log.MIMEHandler.get_handler(contenttype) if handler: handler(self.bstrUrl, self.responseBody) return 0
# read stdout
filename = fp.readline().strip().split()[1].strip("'")
perc = float(fp.readline().split(':')[1].split('%')[0])
gcov = fp.readline().strip().split()[1].strip("'")
# move generated gcov to coverage folder
new_dir = join(target_dir, dirname(source))
try:
    makedirs(new_dir)
except OSError:
    pass
rename(join(obspy_dir, gcov), join(new_dir, gcov))
cov.append((filename, join(new_dir, gcov), perc))

# GENERATE HTML
page = fromstring("<html><table></table></html>")
table = page.xpath('.//table')[0]
for name, gcov, perc in cov:
    td1, td2 = Element('td'), Element('td')
    gcov = gcov.replace(target_dir, './')
    a = Element('a', attrib={'href': gcov})
    a.text = name
    td1.append(a)
    td2.text = "%6.2f%%" % perc
    tr = Element('tr')
    tr.extend([td1, td2])
    table.append(tr)
with open(join(target_dir, 'index.html'), 'wb') as fp:
    fp.write(tostring(page))
cleanup('*.o')
    else:
        old_docs = old.docs[name]
        items = []
        for key in old_docs.docs:
            old_id, old_title, old_xml = old_docs.docs[key]
            if key not in new_docs.docs:
                items.append(builder.I(builder.LI(old_title)))
            else:
                diffs = diff_xml(old_xml, new_docs.docs[key][2], verbose)
                if diffs is not None:
                    title = builder.B(old_title)
                    items.append(builder.LI(title, diffs))
        if not items:
            body.append(builder.P(CHECK, OK))
        else:
            body.append(builder.UL(*items))

parser = ArgumentParser()
parser.add_argument("--old", required=True)
parser.add_argument("--new", default=db.connect(user="******").cursor())
parser.add_argument("--verbose", action="store_true")
opts = parser.parse_args()
old = Data(opts.old)
new = Data(opts.new, old)
body = builder.BODY(builder.H1(TITLE))
compare_tables(body, old, new)
compare_docs(body, old, new, opts.verbose)
report = builder.HTML(HEAD, body)
print(html.tostring(report, pretty_print=True).decode("ascii"))