def parse_body(self) -> str:
    """Look for the article's body.

    :raise ~.ArticleBodyMissing: when no body is found.
    """
    parser = CSSSelector('html body div#content div.sect1')
    body = parser(self.source)
    if not body:
        raise exceptions.ArticleBodyMissing(self)
    return ''.join(lxml.etree.tounicode(section) for section in body)

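# A minimal sketch (not part of the original module) showing what the selector
# above extracts; the page structure and class names here are invented.
from lxml import etree
from lxml.cssselect import CSSSelector

_doc = etree.HTML(
    '<html><body><div id="content">'
    '<div class="sect1"><p>First section</p></div>'
    '<div class="sect1"><p>Second section</p></div>'
    '</div></body></html>')
_sections = CSSSelector('html body div#content div.sect1')(_doc)
# Serializing each match and joining them reproduces parse_body's output.
print(''.join(etree.tounicode(s) for s in _sections))
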
async def get_image(queue):
    """When there is an item in the queue, parse it to get the comic's url"""
    while True:
        payload = await queue.get()
        await asyncio.sleep(0)
        body, selector = payload
        tree = html.fromstring(body)
        select = CSSSelector(selector)
        elements = [e.get('src') for e in select(tree)]
        if elements:
            url = elements[0]
            if not (url.startswith('http:') or url.startswith('https:')):
                url = 'https:' + url
            print(' ', url)
        # ack the queue that the item has been processed
        queue.task_done()

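# A hedged usage sketch (the page markup and selector are invented): feed
# (body, selector) pairs into an asyncio.Queue, run get_image as a worker,
# and wait for the queue to drain.
import asyncio
from lxml import html
from lxml.cssselect import CSSSelector

async def _demo():
    queue = asyncio.Queue()
    page = '<div class="comic"><img src="//example.com/strip.png"></div>'
    await queue.put((page, '.comic img'))
    worker = asyncio.create_task(get_image(queue))
    await queue.join()  # returns once task_done() has acked every item
    worker.cancel()

asyncio.run(_demo())  # prints: https://example.com/strip.png
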
def addImageList(root):
    """ Add an image list based on the <caption> tags """
    images = list()
    count = 0
    for caption in root.xpath('//span'):
        # <span> tags with image captions may contain several CSS classes.
        # Unfortunately BeautifulSoup is unable to find elements by CSS class
        # if the related element carries more than one class, so check the
        # class attribute by hand.
        if 'image-caption-with-description' not in caption.get('class', ''):
            continue
        text = caption.text_content()
        id = 'image-%d' % count
        new_anchor = lxml.html.Element('a')
        new_anchor.attrib['name'] = id
        caption.insert(0, new_anchor)
        images.append(dict(text=text, id=id))
        count += 1

    div_images = lxml.html.Element('div')
    div_images.attrib['id'] = 'image-list'
    div_ul = lxml.html.Element('ul')
    div_images.append(div_ul)
    if images:
        for d in images:
            li = lxml.html.Element('li')
            li.attrib['class'] = 'image-list-entry'
            a = lxml.html.Element('a')
            a.attrib['href'] = '#' + d['id']
            a.attrib['class'] = 'image-list-entry'
            span = lxml.html.Element('span')
            span.text = d['text']
            a.insert(0, span)
            li.append(a)
            div_ul.append(li)

    # check for an existing div#image-list
    nodes = CSSSelector('div#image-list')(root)
    if nodes:
        # replace it (replace() must be called on the parent node)
        nodes[0].getparent().replace(nodes[0], div_images)
    else:
        # add to end of document
        body = root.xpath('//body')[0]
        body.append(div_images)

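# A short usage sketch (the sample markup is invented): parse a page with one
# captioned image, run addImageList(), and the generated div#image-list with
# its anchor appears at the end of <body>.
import lxml.html
from lxml.cssselect import CSSSelector

_page = lxml.html.fromstring(
    '<html><body>'
    '<img src="a.png">'
    '<span class="image-caption image-caption-with-description">Figure 1</span>'
    '</body></html>')
addImageList(_page)
print(lxml.html.tostring(_page, pretty_print=True).decode())
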
def parse_highlights_html(html):
    node = lxml.html.fromstring(html)
    useless_nodes = (list(CSSSelector('.addEditNote')(node)) +
                     list(CSSSelector('.editNote')(node)) +
                     list(CSSSelector('script')(node)) +
                     list(CSSSelector('#header')(node)) +
                     list(CSSSelector('form')(node)))
    for useless in useless_nodes:
        useless.getparent().remove(useless)
    return parse_element_into_books(
        CSSSelector('#allHighlightedBooks')(node)[0])

def getkleague():
    print('<Get Kleague>')
    r = requests.get('http://www.kleague.com/')
    html = lxml.html.fromstring(r.text)
    sel = CSSSelector('div#modal_classic-team-rank table tbody tr')
    nodes = sel(html)
    print('number of table rows: ', len(nodes))
    for teams in nodes:
        for cols in teams:
            if cols.xpath('.//a/text()'):
                print(cols.xpath('.//a/text()')[0], end=' ')
            elif cols.xpath('.//span/text()'):
                print(cols.xpath('.//span/text()')[0], end=' ')
            else:
                print(cols.text.strip(), end=' ')
        print()

def on_sale(tree):
    '''
    checks whether the product is on sale; the sale price lives in a separate
    element, when present. this data gets updated daily, so could change. and
    if we're collecting price, it would be incomplete of us not to check this

    Args:
        the parsed tree element of a product page

    Returns:
        a string containing the sale price if on sale, otherwise False
    '''
    sale_element = CSSSelector('ul li.newsFont b font')(tree)
    if len(sale_element) > 0:
        return sale_element[0].text
    else:
        return False

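# A hedged example: the markup below mimics the product-page structure the
# selector expects; the real site's HTML is an assumption here.
from lxml import html
from lxml.cssselect import CSSSelector

_product = html.fromstring(
    '<ul><li class="newsFont"><b><font>$9.99</font></b></li></ul>')
print(on_sale(_product))  # -> '$9.99'; pages without the element return False
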
def on_process(self):
    self.text_edit_output.clear()
    self.label_error.clear()
    self.button_detail_error.hide()

    self.last_error_message = None
    self.last_detail_error_message = None

    try:
        text = self.text_edit_input.toPlainText()
        if not text or not self.le_xpath_css.text():
            return

        search_text = self.le_xpath_css.text()
        is_html_parser = self.rb_parser_html.isChecked()
        if is_html_parser:
            root = html.fromstring(text)
        else:
            root = etree.fromstring(text)

        if self.rb_xpath.isChecked():
            result = root.xpath(search_text)
        else:
            selector = CSSSelector(search_text,
                                   translator='html' if is_html_parser else 'xml')
            result = selector(root)

        print(len(result), result)

        result = map(to_str, result)
        output = '\n'.join('{}. {}'.format(i, x) for i, x in enumerate(result, 1))
        self.text_edit_output.setPlainText(output)

    except Exception as e:
        # # Print the error to the console
        # traceback.print_exc()

        # Save it to a variable
        tb = traceback.format_exc()

        self.last_error_message = str(e)
        self.last_detail_error_message = str(tb)

        self.button_detail_error.show()
        self.label_error.setText('Error: ' + self.last_error_message)

def pullup_elems(tree, loader_context):
    for elem_child, parent_dist in loader_context.get("pullup_elems", {}).items():
        selector = CSSSelector(elem_child)
        for elem in selector(tree):
            parent = elem
            for _ in range(parent_dist):
                parent = parent.getparent()
            if parent is not None and parent.getparent() is not None:
                elem.tail = parent.tail
                parent.getparent().replace(parent, elem)
            else:
                logger.error(
                    'Could not find parent with distance {} for selector "{}".'
                    .format(parent_dist, elem_child))
    return [tree]

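# A hedged sketch of pullup_elems in action (the loader_context shape is
# inferred from the code above; the markup is invented): the <img> replaces
# its wrapper ancestor at the configured distance.
import logging
import lxml.html
from lxml.cssselect import CSSSelector

logger = logging.getLogger(__name__)  # pullup_elems expects a module-level logger

_tree = lxml.html.fromstring(
    '<article><div class="wrap"><img src="x.png"></div></article>')
pullup_elems(_tree, {"pullup_elems": {"div.wrap img": 1}})
print(lxml.html.tostring(_tree).decode())
# -> <article><img src="x.png"></article>
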
def getImages(_htmlobj, path, _base):
    sel = CSSSelector("img")
    htmlobj = _htmlobj
    images = sel(htmlobj)
    for image in images:
        originalsrc = image.attrib["src"]
        if validImageFormat(originalsrc):
            if _base != "":
                fileUrl = getFileUrl(_base, originalsrc)
            else:
                fileUrl = getFileUrl(htmlobj.base, originalsrc)
            fileName = path + fileUrl.split('/')[-1].replace('%20', '_')
            staticUrl = '/static/' + fileName.split('/')[-1]
            saveFile(fileUrl, fileName)
            image.attrib['src'] = staticUrl
    return htmlobj

def check_for_unicorn(tree):
    '''
    gives us a quick thumbs-up or thumbs-down on whether each product page
    contains a unicorn.

    Args:
        the parsed DOM tree of a product page

    Returns:
        True if the page mentions a unicorn, otherwise False
    '''
    check_unicorn = CSSSelector('body table tr td table tr td font')(tree)
    is_unicorn = False
    try:
        if 'one Location' in check_unicorn[0].text:
            is_unicorn = True
        return is_unicorn
    except IndexError:
        print('no text to check here? that would be weird.')
        print([c.text for c in check_unicorn])

def getCoinInfo(coin):
    url = 'https://www.feixiaohao.com/currencies/' + coin
    ret = requests.get(url)
    tree = html.fromstring(ret.text)
    h24 = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[1]/div[3]/div[1]/span/text()')[0]
    l24 = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[1]/div[3]/div[2]/span/text()')[0]
    percentage = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[2]/div[5]/div/span/text()')[0]
    flowRate = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[3]/div[5]/div/span/text()')[0]
    turnoverRate = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[4]/div[5]/div/span/text()')[0]
    icoInfo = tree.xpath('/html/body/div[5]/div/div[6]/table')
    info = {}
    if len(icoInfo) > 0:
        sel = CSSSelector('tr td')
        tds = sel(icoInfo[0])
        vals = [
            "status", "platform", "icoDistribute", "investPercentage",
            "icoTotal", "icoSupply", "icoStart", "icoStop", "icoOpenPrice",
            "icoMethod", "icoTarget", "icoVolume", "icoAveragePrice",
            "icoSuccessCount", "icoSuccessVolume", "characteristic",
            "security", "law", "area", "consultant", "sellAgentAddress",
            "blogAddress"
        ]
        for index, td in enumerate(tds):
            if index < len(vals):
                text = "" if td.text is None else td.text
                info[vals[index]] = text
    result = {
        "coin": coin,
        "h24": h24,
        "l24": l24,
        "percentage": percentage,
        "flowRate": flowRate,
        "turnoverRate": turnoverRate,
        "info": str(info)
    }
    return result

def prepare_text(self, obj):
    """Return text for indexing.

    Args:
        obj (GeneralPage): Object for indexing.

    Returns:
        String for indexing.
    """
    rendered = render_to_string(obj.template, {"LANGUAGE_CODE": "en"})
    html = fromstring(rendered)
    selector = CSSSelector("#general-page-content")
    try:
        contents = selector(html)[0].text_content()
    except IndexError:
        raise TemplateSyntaxError(CONTENT_NOT_FOUND_ERROR_MESSAGE)
    return contents

def scrappWeb(url, filter, path, base):
    try:
        webdata = urllib.request.urlopen(url)
    except Exception:
        return url
    webstring = webdata.read().decode('utf-8', errors='replace')
    if filter == '':
        return webstring
    # ugly hack for the Android course.
    try:
        htmlobj = html.fromstring(
            webstring.replace('font-size: 12px', 'font-size: 12pt'))
        sel = CSSSelector(filter)
        htmlobj = getImages(sel(htmlobj)[0], path, base)
        return html.tostring(sel(htmlobj)[0])
    except Exception:
        return ""

def getPageInfos(num):
    url = 'https://www.feixiaohao.com/exchange'
    if num > 1:
        url = url + '/list_' + str(num) + '.html'
    ret = requests.get(url)
    tree = html.fromstring(ret.text)
    trs = tree.xpath('/html/body/div[5]/div/div[1]/div[3]/table/tbody/tr')
    infos = []
    for tr in trs:
        info = {}
        tradeTypes = []
        sel = CSSSelector('td a')
        tds = sel(tr)
        for index, td in enumerate(tds):
            if td.text is not None:
                if index == 1:
                    info['h24Volume'] = td.text
                elif index == 2:
                    info['marketNum'] = td.text
                elif index == 3:
                    info['country'] = td.text
            if len(td.cssselect('a img')) > 0:
                img = td.cssselect('a img')[0]
                info['icon'] = img.get('src')
                href = td.cssselect('a')[0].get('href')
                info['code'] = href[10:len(href) - 1]
            aElm = td.cssselect('a')
            if len(aElm) > 0:
                for elm in aElm:
                    href = elm.get('href')
                    pos = href.find('type=')
                    if pos != -1:
                        tradeTypes.append(href[16:])
        info['tradeTypes'] = ','.join(tradeTypes)
        infos.append(info)
    for info in infos:
        setExInfo(info)
    return infos

def replaceUnresolvedLinks(root):
    """ This transformation replaces all a.external-link nodes with a proper
        footnote. Used for PDF generation only (html_mode = 'split')
    """
    for link in CSSSelector('a.external-link')(root):
        href = link.attrib['href']
        span1 = lxml.html.Element('span')
        span1.attrib['class'] = 'generated-footnote-text'
        span1.text = link.text_content()
        span2 = lxml.html.Element('span')
        span2.attrib['class'] = 'generated-footnote'
        span2.text = href
        span1.insert(1, span2)
        link.getparent().replace(link, span1)

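# A brief demonstration (input markup invented) of the footnote rewrite above:
# the link text is kept and the href is appended as a nested footnote span.
import lxml.html
from lxml.cssselect import CSSSelector

_doc = lxml.html.fromstring(
    '<p>See <a class="external-link" href="https://example.com">the docs</a></p>')
replaceUnresolvedLinks(_doc)
print(lxml.html.tostring(_doc).decode())
# <p>See <span class="generated-footnote-text">the docs
#   <span class="generated-footnote">https://example.com</span></span></p>
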
def get_pagination(self, server, url):
    resp = server.get(url)
    tree = lxml.html.fromstring(resp.text)
    sel = CSSSelector('#SDTOPDESTCONTENT > div.deckTools.btm > div > div > :last-child')
    pageNum = sel(tree)
    if len(pageNum) != 0:
        count_of_page = pageNum[0].get('data-page-number')
    else:
        count_of_page = 1
    page_numbers = range(0, int(count_of_page), 1)
    for page in page_numbers:
        link_number = int(page) * 20
    return link_number

def get_submission_dates(self, arxiv_tree, queried_version):
    links = CSSSelector("div.submission-history")(arxiv_tree)[0]
    #print("links are", links)
    versions = {}
    blob = self.clean_gunky_arxiv_data(links.text_content())
    #print("Parsing", blob)
    for line in blob.split("\n"):
        match = self.version_re.match(line)
        if match:
            version, d = match.group(1), match.group(2)
            d = datetime.datetime.strptime(d, '%a, %d %b %Y').date()
            versions[version] = d
            if queried_version == version:
                return {version: d}
            #print(version, d)
    return versions

def _get_post_images(post_page):
    """Scrape the images from a post"""
    images = CSSSelector(".postbody img")(post_page)
    # filter out forum images that are not part of the post
    images = [
        img for img in images
        if "smilies" not in img.get("src") and "styles" not in img.get("src")
    ]
    # add base url for relative urls
    images = [
        _forum_url + img.get("src")[2:] if img.get("src")[0:2] == "./"
        else img.get("src")
        for img in images
    ]
    return images

def _selector_query_found(self, bodies, selector):
    selector = selector.split(':')[0]
    if '}' in selector:
        # XXX does this ever happen any more?
        return
    for body in bodies:
        try:
            for _ in CSSSelector(selector)(body):
                return True
        except SelectorSyntaxError:
            print('TROUBLEMAKER', file=sys.stderr)
            print(repr(selector), file=sys.stderr)
        except ExpressionError:
            print('EXPRESSIONERROR', file=sys.stderr)
            print(repr(selector), file=sys.stderr)
    return False

def book_keywords(obj):
    html = obj.getText()
    if not html:
        return []
    if isinstance(html, bytes):
        html = html.decode('utf-8')
    doc = lxml.html.parse(StringIO(html))
    nodes = doc.xpath(CSSSelector('span.keyword').path)
    keywords = []
    for node in nodes:
        if not node.attrib.get('title'):
            continue
        keywords.append(node.attrib['title'].strip())
    return keywords

def get_balance(self):
    self.open_with_retry(self.LOGIN_URL, u'login')
    logging.debug(u'Filling login form')
    self.browser.select_form(nr=0)
    self.browser[u'username'] = CONFIG[u'odesk'][u'username']
    self.browser[u'password'] = CONFIG[u'odesk'][u'password']
    logging.debug(u'Submitting login form')
    self.browser.submit()
    page = self.open_with_retry(self.ACCOUNT_URL, u'account')
    logging.debug(u'Parsing page and extracting balance')
    html = etree.HTML(page)
    raw_balance = CSSSelector(u'div.oMain div.oTxtMega span.oPos')(html)[0].text
    return Currency(self.string_to_decimal(raw_balance), u'USD')

def extract_jokes():
    result = []
    url = 'https://bestlifeonline.com/bad-funny-puns/'
    response = requests.get(url)
    sel = CSSSelector('.content li')
    html_elmnts = html.fromstring(response.content)
    for joke in sel(html_elmnts):
        result.append({
            'joke': joke.text_content().strip(),
            'score': None,
            'categories': ['Pun']
        })
    return result

def extract_jokes(page):
    result = []
    url = 'https://www.rd.com/jokes/puns/page/{}/'
    response = requests.get(url.format(page))
    sel = CSSSelector('.excerpt-wrapper')
    html_elmnts = html.fromstring(response.content)
    for joke in sel(html_elmnts):
        result.append({
            'joke': joke.text_content().strip(),
            'score': None,
            'categories': ['Pun']
        })
    return result

def convert_iframes(tree, loader_context):
    """Convert iframes to divs with links to their src.

    convert_iframes() is called after remove_elems() so that unwanted iframes
    can be eliminated first.
    """
    base_url = loader_context.get("base_url", None) if loader_context else None
    selector = CSSSelector("iframe")
    for elem in selector(tree):
        if "src" not in elem.attrib:
            continue
        url = urljoin(base_url, elem.attrib.pop("src"))
        elem_new = lxml.html.fragment_fromstring(
            '<div><a href="{url}">{url}</a></div>'.format(url=url))
        elem_new.tail = elem.tail
        elem.getparent().replace(elem, elem_new)
    return [tree]

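# A hedged usage sketch: run convert_iframes on a fragment containing an
# embedded iframe; relative srcs resolve against the assumed base_url.
import lxml.html
from lxml.cssselect import CSSSelector
from urllib.parse import urljoin

_tree = lxml.html.fromstring(
    '<article><iframe src="/embed/42"></iframe></article>')
convert_iframes(_tree, {"base_url": "https://example.com"})
print(lxml.html.tostring(_tree).decode())
# <article><div><a href="https://example.com/embed/42">
#   https://example.com/embed/42</a></div></article>
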
def extract_jokes():
    result = []
    url = 'https://www.sunnyskyz.com/funny-jokes/221/58-Funny-Puns-You-Can-t-Wait-To-Use'
    response = requests.get(url)
    sel = CSSSelector('#picofday p')
    html_elmnts = html.fromstring(response.content)
    for joke in sel(html_elmnts):
        result.append({
            'joke': joke.text_content().strip(),
            'score': None,
            'categories': ['Pun']
        })
    return result

def url_selector_values(url, selector, attr):
    """Provide attribute list of type `attr` from `selector` match at `url`.

    If an attribute is not present for a matched element, `None` is added
    to the returned list.

    Common use:

    >>> url_selector_values(my_url, 'span a', 'href')
    ['a.html', 'b.html', 'c.html']
    >>> url_selector_values(my_url, 'span a', 'rel')
    [None, None, 'external']
    >>> url_selector_values(my_url, 'span a', 'foo')
    [None, None, None]
    >>>
    """
    tree = etree.parse(urllib.request.urlopen(url), parser)
    cs = CSSSelector(selector)
    return [branch.get(attr) for branch in cs(tree)]

def init():
    global meta_selectors
    global prop2selectors
    global sel2value_attr
    global bad_chars
    global website_pattern

    for prop, sels in meta_selectors.items():
        objs = []
        for sel in sels:
            sel_obj = CSSSelector(sel)
            objs.append(sel_obj)
            if 'link' in sel:
                sel2value_attr[sel_obj] = 'href'
            elif 'meta' in sel:
                sel2value_attr[sel_obj] = 'content'
            else:
                sel2value_attr[sel_obj] = None
        prop2selectors[prop] = objs

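# A hedged illustration of the module-level tables init() expects; these
# example selectors and property names are assumptions, not the original data.
from lxml.cssselect import CSSSelector

meta_selectors = {
    'canonical': ['link[rel=canonical]'],
    'description': ['meta[name=description]'],
    'title': ['title'],
}
prop2selectors = {}
sel2value_attr = {}
bad_chars = ''
website_pattern = None

init()
# After init(), prop2selectors maps each property to compiled CSSSelector
# objects, and sel2value_attr records which attribute holds the value
# ('href' for link tags, 'content' for meta tags, element text otherwise).
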
def get_week_names():
    all_weeks = dict()
    for year in range(1922, 2021):
        print(year)
        url = f'{base_url}/years/{year}/week_1.htm'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        sel = CSSSelector('a')
        h = html.fromstring(response.text)
        weeks = {
            f'{year} Week {r["href"].split("_")[1].replace(".htm", "")}':
                f'{year} {r.text}'
            for r in soup.find_all('a', href=True)
            if f'/years/{year}/week' in r['href']
        }
        all_weeks = {**all_weeks, **weeks}
    with open('data/weeks.json', 'w') as f:
        f.write(json.dumps(all_weeks))

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    html = re.sub(r'\<a(?:.*?)resultsSectionLabel(?:.*?)/a\>', '', html)
    # print(html)
    root = lxml.html.fromstring(html)
    sel = CSSSelector('div.cat a')
    links = [a.get('href') for a in sel(root)]
    print(links)
    scrape_links(links)
    scraperwiki.sqlite.save_var('last_link', links[0])
    scraperwiki.sqlite.save_var('dclastlink', links[0])
    next_link = root.cssselect("a.next")
    # print(next_link)
    if next_link:
        next_url = urllib.parse.urljoin(base_url, next_link[0].attrib.get('href'))
        # print(next_url)
        scrape_and_look_for_next_link(next_url)

def find_language_rows(content: str, language: str):
    html_content = fromstring(content)
    language_selector = CSSSelector(f"span.{LANGUAGE_CSS_CLASS}")
    language_cells = language_selector(html_content)
    rows = []
    for cell in language_cells:
        language_text = cell.text_content().lower()
        if "/" not in language_text:
            if language == language_text:
                row = cell.getparent().getparent()
                rows.append(row)
        else:
            if language in language_text:
                row = cell.getparent().getparent()
                rows.append(row)
    return rows

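# A hedged usage sketch; LANGUAGE_CSS_CLASS and the table markup are
# assumptions chosen to match the selector and the two getparent() hops above
# (span inside <td> inside <tr>).
from lxml.html import fromstring
from lxml.cssselect import CSSSelector

LANGUAGE_CSS_CLASS = 'language'

_table = (
    '<table>'
    '<tr><td><span class="language">English</span></td><td>Book A</td></tr>'
    '<tr><td><span class="language">English / French</span></td><td>Book B</td></tr>'
    '</table>')
for row in find_language_rows(_table, 'english'):
    print(row[1].text_content())  # -> Book A, then Book B
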