def get_selected_items(self):
    response = self.session.get(self.url("selected_items"))
    tree = lxml.html.fromstring(response.text)
    item_sel = CSSSelector('div[headers="th_selected_items"]')
    name_sel = CSSSelector("h4.il_ContainerItemTitle")
    icon_sel = CSSSelector("img.ilListItemIcon")
    results = item_sel(tree)
    for result in results:
        item = Item()
        name = name_sel(result)[0]
        try:
            name = CSSSelector("a")(name)[0]
        except IndexError:
            pass
        item.name = name.text
        item.url = name.get("href")
        icon = icon_sel(result)[0]
        item.icon = icon.get("src")
        yield item
def _fetch_from_cache(language, url):
    from . import utils
    cms_url = utils.get_cms_url(language, url)
    if cms_url in cache:
        html = cache.get(cms_url)
    else:
        html = utils.get_cms_page(language, url)
        cache.set(cms_url, html)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser).getroot()
    # Removing all tables of contents
    toc = CSSSelector('.toc')
    for table in toc(tree):
        table.getparent().remove(table)
    title = CSSSelector('.page-title')(tree)[0]
    title.getparent().remove(title)
    elements = list(CSSSelector('.cms-content')(tree)[0])
    headers = [i for i, e in enumerate(elements) if CSSSelector('.section-header')(e)]
    title_icons = list(CSSSelector('.title-icon')(tree))
    page_contents = []
    for i, h in enumerate(headers):
        icon = ""
        if i < len(title_icons) and 'src' in title_icons[i].attrib:
            icon = title_icons[i].attrib['src']
        element = elements[h]
        if (i + 1) == len(headers):
            contents = elements[h + 1:]
        else:
            contents = elements[h + 1:headers[i + 1]]
        for e in elements:
            if 'dir' in e.attrib:
                del e.attrib['dir']
        section_title = CSSSelector('a[name]')(element)[0].text
        section_body = ""
        for c in contents:
            section_body += etree.tostring(c, pretty_print=True, method="html")
        page_contents.append({
            "is_important": bool(CSSSelector('.important')(element)),
            "title": section_title,
            "body": section_body,
            "icon": icon,
        })
    return {
        "title": title.text,
        "contents": page_contents,
    }
def process_html(self, html, path):
    parser = etree.HTMLParser(encoding='utf-8')
    html = html.decode('utf-8')  # decode once so the line search below compares str to str
    tree = etree.fromstring(html, parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        for each in body.iter():
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        first_line = style.text.strip().splitlines()[0]
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, path)
                self.blocks[key] = style.text
                break
def detect_withdrawn(self, tree, url):
    comment = CSSSelector(".tablecell.comments")(tree)
    if comment:
        comment = comment[0].text_content()
        if "withdrawn" in comment.lower():
            print("Paper", url, "appears to be withdrawn!")
            return True
    return False
def post_node(title, datetime, content):
    post = copy(POST)
    CSSSelector('.title .text')(post)[0].text = title
    CSSSelector('.datetime')(post)[0].text = datetime.strftime(
        "%H:%M on %A the %%s of %B, %Y") % niceday(datetime)
    content_css = CSSSelector('.content')(post)[0]
    for fragment in fragments_fromstring(cleaner_trusted.clean_html(content)):
        content_css.append(fragment)
    return post
def get_or_create_head(root):
    """Ensures that `root` contains a <head> element and returns it."""
    head = CSSSelector('head')(root)
    if not head:
        head = etree.Element('head')
        body = CSSSelector('body')(root)[0]
        body.getparent().insert(0, head)
        return head
    else:
        return head[0]
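A minimal usage sketch for the helper above, on a hypothetical document that lacks a head; the style element appended here is illustrative only:

import lxml.html
from lxml import etree

root = lxml.html.fromstring("<html><body><p>hi</p></body></html>")
head = get_or_create_head(root)  # creates <head> before <body> if it is missing
head.append(etree.fromstring('<style>p { color: red; }</style>'))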
def process_html(self, html, url):
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.fromstring(html.encode('utf-8'), parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        for each in body.iter():
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        try:
            first_line = style.text.strip().splitlines()[0]
        except IndexError:
            # meaning the inline style tag was just whitespace
            continue
        except AttributeError:
            # happens when the style tag is completely empty,
            # not even whitespace
            continue
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, url)
                self.blocks[key] = style.text
                break
    for link in CSSSelector('link')(page):
        if (
            link.attrib.get('rel', '') == 'stylesheet'
            or link.attrib['href'].lower().split('?')[0].endswith('.css')
        ):
            link_url = self.make_absolute_url(url, link.attrib['href'])
            key = (link_url, link.attrib['href'])
            self.blocks[key] = self.download(link_url)
            if self.preserve_remote_urls:
                self.blocks[key] = self._rewrite_urls(
                    self.blocks[key], link_url
                )
def get_submission_dates(self, arxiv_tree, queried_version):
    links = CSSSelector("div.submission-history")(arxiv_tree)[0]
    versions = {}
    # print "Parsing", links.text_content()
    for line in links.text_content().split("\n"):
        match = self.version_re.match(line)
        if match:
            version, d = match.group(1), match.group(2)
            d = datetime.datetime.strptime(d, '%a, %d %b %Y').date()
            versions[version] = d
            if queried_version == version:
                return {version: d}
            # print version, date
    return versions
def process_html(self, html, url):
    parser = etree.HTMLParser()
    tree = etree.fromstring(html, parser).getroottree()
    page = tree.getroot()
    if page is None:
        print repr(html)
        raise ParserError("Could not parse the html")
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        for each in body.iter():
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        first_line = style.text.strip().splitlines()[0]
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, url)
                self.blocks[key] = style.text
                break
    for link in CSSSelector('link')(page):
        if (
            link.attrib.get('rel', '') == 'stylesheet'
            or link.attrib['href'].lower().split('?')[0].endswith('.css')
        ):
            link_url = self.make_absolute_url(url, link.attrib['href'])
            key = (link_url, link.attrib['href'])
            self.blocks[key] = self._download(link_url)
            if self.preserve_remote_urls:
                self.blocks[key] = self._rewrite_urls(
                    self.blocks[key], link_url
                )
def load_stations(file="stations-converted.json"):
    global STATIONS
    with open(file) as f:
        STATIONS = anyjson.deserialize(f.read())
    for station in STATIONS.values():
        try:
            uri = "http://hydro.chmi.cz/isarrow/object.php?seq=2000855701&chemie=1&biota=1&ukol_p=1&id_objekt=&vod_typ=R&nadmh_sign=%3E&rickm_sign=%3E&rok_od=2007&rok_do=2012&objekty_chemdata=1&matrice=2000868184&typodb=41"
            seq = CSSSelector("form input[name='seq']")(
                fromstring(urllib2.urlopen(uri).read().decode("cp1250"))
            )[0].value
            # print 'seq is ' + seq
            uri = (
                "http://hydro.chmi.cz/isarrow/object.php?agenda=POV&objekty_chemdata=1&objekty_biodata=&taxon_tree=&seq="
                + seq +
                "&data_sel=chemdata&chemie=1&biota=1&rok_od=2007&rok_do=2012&matrice=2000868184&typodb=41&tscongrp=&tscon=&data_mez_stanovitelnosti=&data_od=&data_do=&taxon=&send=Chemick%E9+vzorky"
            )
            tree = fromstring(urllib2.urlopen(uri).read().decode("cp1250"))
            link = CSSSelector("table.tbl a")(tree)[-1]
            uri = "http://hydro.chmi.cz/isarrow/" + link.get("href")
            tree = fromstring(urllib2.urlopen(uri).read().decode("cp1250"))
            csv_link = tree.xpath("//form[1]//a")[0]
            uri = "http://hydro.chmi.cz/isarrow/" + csv_link.get("href")
            # FIXME: CSV export is now broken on IS ARROW
            # wait for them to fix it or parse from the table -- and store relevant data into a structure
            reader = csv.reader(urllib2.urlopen(uri))
            for row in reader:
                print row
        except Exception:
            print "Failed to retrieve values for station " + station["id"]
            import traceback
            traceback.print_exc()
def make_emoji_img_elem(emoji_span_elem: lxml.html.HtmlElement) -> lxml.html.HtmlElement:
    # Convert the emoji spans to img tags.
    classes = emoji_span_elem.get('class')
    match = re.search(r'emoji-(?P<emoji_code>\S+)', classes)
    # re.search is capable of returning None,
    # but since the parent function should only be called with a valid css element
    # we assert that it does not.
    assert match is not None
    emoji_code = match.group('emoji_code')
    emoji_name = emoji_span_elem.get('title')
    alt_code = emoji_span_elem.text
    image_url = base_url + '/static/generated/emoji/images-%(emojiset)s-64/%(emoji_code)s.png' % {
        'emojiset': emojiset,
        'emoji_code': emoji_code,
    }
    img_elem = lxml.html.fromstring(
        '<img alt="%(alt_code)s" src="%(image_url)s" title="%(title)s">' % {
            'alt_code': alt_code,
            'image_url': image_url,
            'title': emoji_name,
        })
    img_elem.set('style', 'height: 20px;')
    img_elem.tail = emoji_span_elem.tail
    return img_elem
# coding: utf-8
import lxml.html
import requests
from lxml.cssselect import CSSSelector

keyword = '비오는'  # Korean for "rainy"; the search keyword
r = requests.get("http://music.naver.com/search/search.nhn?query=" + keyword + "&x=0&y=0")
_html = lxml.html.fromstring(r.text)

sel = CSSSelector('table[summary] > tbody > ._tracklist_move')
# Apply the selector to the DOM tree.
nodes = sel(_html)

_selName = CSSSelector('.name > a.title')
_selArtist = CSSSelector('._artist.artist')
_selAlbum = CSSSelector('.album > a')

for node in nodes:
    # print lxml.html.tostring(item)
    _name = _selName(node)
    _artist = _selArtist(node)
    _album = _selAlbum(node)
    if _name:
        print _artist[0].text_content().strip(),
        print "---",
        print _name[0].text_content(),
        print "---",
        print _album[0].text_content()
def csstext(target, selector):
    from lxml.cssselect import CSSSelector
    return ' '.join(e.text_content() for e in CSSSelector(selector)(target)).strip()
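A quick usage sketch for csstext, with a hypothetical fragment:

import lxml.html

tree = lxml.html.fromstring('<div><p class="note">Hello</p><p class="note">world</p></div>')
print(csstext(tree, 'p.note'))  # -> "Hello world"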
def _parse_html_for_content(html):
    """
    This function takes in the HTML from transifex and looks for the special
    tags that break down the anchors into two separate divs (see the function
    above).
    :param html:
    :return:
    """
    p = re.compile(r'<.*?>')
    if p.findall(html):
        h = html_parser.HTMLParser()
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)

        a = CSSSelector('div.former-anchor')
        translatable_a = CSSSelector('div.former-anchor-translatable')
        img = CSSSelector('div.former-image')
        phones = CSSSelector('div.former-tel')

        anchors = a(tree)
        for anchor in anchors:
            attributes = [(k.replace('data-a-', ''), h.unescape(v))
                          for k, v in dict(anchor.attrib).iteritems()
                          if 'data-a-' in k]
            ht_st = "<a>{}</a>".format(stringify_children(anchor))
            div = etree.parse(StringIO(fix_html_fragment(ht_st))).getroot()
            for k, v in attributes:
                div.attrib[k] = v
            swap_element_inbound(div, anchor)

        anchors = translatable_a(tree.getroot())
        for anchor in anchors:
            attributes = [(k.replace('data-a-', ''), h.unescape(v))
                          for k, v in dict(anchor.attrib).iteritems()
                          if 'data-a-' in k]
            content = etree.Element('div')
            link = etree.Element('div')
            for c in anchor:
                if 'class' in c.attrib:
                    if c.attrib['class'] == 'text':
                        content = c
                    if c.attrib['class'] == 'href':
                        link = c
            ht_st = "<a>{}</a>".format(stringify_children(content))
            div = etree.parse(StringIO(fix_html_fragment(ht_st))).getroot()
            for k, v in attributes:
                div.attrib[k] = v
            href = stringify_children(link)
            if href:
                div.attrib['href'] = h.unescape(href)
            swap_element_inbound(div, anchor)

        images = img(tree.getroot())
        for image in images:
            attributes = [(k.replace('data-img-', ''), h.unescape(v))
                          for k, v in dict(image.attrib).iteritems()
                          if 'data-img-' in k]
            div = etree.Element('img')
            for k, v in attributes:
                div.attrib[k] = h.unescape(v)
            swap_element_inbound(div, image)

        tels = phones(tree.getroot())
        for tel in tels:
            if 'class' in tel.attrib:
                classes = tel.attrib['class'].split(' ')
                tag_format = "{}"
                if 'has-b' in classes:
                    tag_format = "<b>{}</b>".format(tag_format)
                if 'has-u' in classes:
                    tag_format = "<u>{}</u>".format(tag_format)
                if 'has-strong' in classes:
                    tag_format = "<strong>{}</strong>".format(tag_format)
                if 'has-em' in classes:
                    tag_format = "<em>{}</em>".format(tag_format)
                if 'has-i' in classes:
                    tag_format = "<i>{}</i>".format(tag_format)
                tag_format = "<span class=\"tel\">{}</span>".format(tag_format)
                div = etree.parse(
                    StringIO(tag_format.format(
                        tel.attrib['data-tel-number']))).getroot()
                swap_element_inbound(div, tel)

        html = etree.tostring(tree)
    soup = BeautifulSoup(html)
    return unicode(soup.prettify())
def pull_from_transifex(slug, language, project=settings.TRANSIFEX_PROJECT_SLUG, retry=True):
    from django.contrib.auth import get_user_model
    User = get_user_model()

    # cache.add fails if the key already exists
    acquire_lock = lambda: cache.add('publishing-translation', 'true', 60 * 5)
    # memcache delete is very slow, but we have to use it to take
    # advantage of using add() for atomic locking
    release_lock = lambda: cache.delete('publishing-translation')

    try:
        if language == 'en':
            return
        import cms.api
        internal_language = (language if language not in SHIM_LANGUAGE_DICTIONARY
                             else SHIM_LANGUAGE_DICTIONARY[language])
        while True:
            if acquire_lock():
                break
            time.sleep(5)
        staging = Title.objects.filter(language='en', slug='staging')
        if staging:
            staging = staging[0].page
        titles = Title.objects.filter(language='en', slug=slug,
                                      page__in=staging.get_descendants())
        if not titles:
            logger.info('Page not found. Ignoring.')
            return  # without this return the lookup below would raise IndexError
        page = titles[0].page.get_draft_object()
        password = settings.TRANSIFEX_PASSWORD
        user = settings.TRANSIFEX_USER
        transifex_language = language
        transifex_url_data = {
            "project": project,
            "slug": page.get_slug('en'),
            "language": transifex_language,
        }
        fetch_format = "http://www.transifex.com/api/2/project/{project}/resource/{slug}html/translation/{language}/?mode=default"
        logger.info("Trying to request: %s" % fetch_format.format(**transifex_url_data))
        logger.info("With creds: %s %s" % (user, password))
        r = requests.get(fetch_format.format(**transifex_url_data), auth=(user, password))
        translation = r.json()
        text = translation['content'].strip()
        text = _parse_html_for_content(text)
        soup = BeautifulSoup(text)
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(unicode(soup.prettify())), parser)
        selector = CSSSelector('div[data-id]')
        title_selector = CSSSelector('div.title')
        # Directions are handled application-wise
        dir_selector = CSSSelector('[dir]')
        for element in dir_selector(tree.getroot()):
            del element.attrib['dir']
        content = selector(tree.getroot())
        title = title_selector(tree.getroot())
        if title:
            try:
                title = title[0].text
                title_obj = page.get_title_obj(internal_language, fallback=False)
                if type(title_obj).__name__ == 'EmptyTitle':
                    logger.info('Creating new title')
                    en_title_obj = page.get_title_obj('en')
                    title_obj = cms.api.create_title(
                        language=internal_language,
                        title=en_title_obj.title.strip(),
                        page=page,
                        slug=en_title_obj.slug.strip(),
                    )
                    title_obj.save()
                title_obj.page_title = title.strip()
                title_obj.save()
            except Exception as e:
                logger.exception('Error updating the application.')
        dict_list = []
        for div in content:
            plugin_dict = {
                'id': div.attrib['data-id'],
                'type': div.attrib['data-type'],
                'parent': div.attrib['data-parent'],
                'position': div.attrib['data-position'],
                'translated': (div.text or '') + u''.join([
                    etree.tostring(a, pretty_print=True, method="html")
                    for a in div
                ]),
            }
            dict_list.append(plugin_dict)
        blame = User.objects.filter(is_staff=True, is_superuser=True)[0]
        _translate_page(dict_list, internal_language, page)
        cms.api.publish_page(page, blame, internal_language)
    except Exception as e:
        if retry:
            time.sleep(5)
            pull_from_transifex.delay(slug, language, project, False)
        else:
            traceback.print_exc()
            logger.info('Tried to retry it but it still erred out.')
            raise e
    finally:
        release_lock()
def extract_reply_cids(html):
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]
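A small, hypothetical check of extract_reply_cids on a hand-written fragment:

html = (
    '<div class="comment-replies-header">'
    '<span class="load-comments" data-cid="abc123">View replies</span>'
    '</div>'
)
print(extract_reply_cids(html))  # -> ['abc123']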
import lxml.html
from lxml.cssselect import CSSSelector
import requests

response = requests.get('http://www.google.com/search?q=python')
tree = lxml.html.fromstring(response.text)
link = CSSSelector('a[href]')
lines = link(tree)
print len(lines)
for line in lines:
    save = lxml.html.tostring(line)
    if 'https://' in save:
        print save
        print
def split_html(html_filename, split_at_level=0):
    """ Split aggregated and rendered HTML document at some <hX> tag(s).
        split_at_level=0 -> split at H1 tags,
        split_at_level=1 -> split at H1 and H2 tags.
        Returns a list of dicts with keys 'html' referring to the
        subdocument and 'level' indicating the split point.
    """
    destdir = os.path.dirname(html_filename)
    soup = BeautifulSoup(file(html_filename).read())
    fp = StringIO(soup.__str__(prettyPrint=True))
    docs = list()
    current_doc = list()
    for line in fp:
        line = line.rstrip()
        for level in range(split_at_level + 1):
            if '<h%d' % (level + 1) in line.lower():
                html = '\n'.join(current_doc)
                root = lxml.html.fromstring(unicode(html, 'utf-8'))
                title = u''
                h1_nodes = root.xpath('//h1')
                if h1_nodes:
                    title = h1_nodes[0].text_content().strip()
                # count tables and images
                number_tables = len(root.xpath('//table'))
                number_images = len(CSSSelector('div.image-caption')(root))
                # find all linkable nodes with an ID attribute
                node_ids = list()
                for node in root.xpath('.//*'):
                    node_id = node.get('id')
                    if node_id:
                        node_ids.append(node_id)
                html = lxml.html.tostring(root, encoding=unicode)
                docs.append(dict(html=html,
                                 level=level,
                                 title=title,
                                 node_ids=node_ids,
                                 number_images=number_images,
                                 number_tables=number_tables))
                current_doc = []
                break
        current_doc.append(line)

    # now deal with the remaining part of the document
    html = '\n'.join(current_doc)
    root = lxml.html.fromstring(unicode(html, 'utf-8'))
    title = u''
    h1_nodes = root.xpath('//h1')
    if h1_nodes:
        title = h1_nodes[0].text_content().strip()
    # count tables and images
    number_tables = len(root.xpath('//table'))
    number_images = len(CSSSelector('div.image-caption')(root))
    # find all linkable nodes with an ID attribute
    node_ids = list()
    for node in root.xpath('.//*'):
        node_id = node.get('id')
        if node_id:
            node_ids.append(node_id)
    html = lxml.html.tostring(root, encoding=unicode)
    docs.append(dict(html=html,
                     level=0,
                     title=title,
                     node_ids=node_ids,
                     number_images=number_images,
                     number_tables=number_tables))

    # now store files on the filesystem
    ini_filename = os.path.join(destdir, 'documents.ini')
    fp_ini = codecs.open(ini_filename, 'w', 'utf-8')
    for count, d in enumerate(docs[1:]):
        filename = os.path.join(destdir, 'split-0/%d-level-%d.html' % (count, d['level']))
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))
        file(filename, 'w').write(d['html'].encode('utf-8'))
        print >> fp_ini, '[%d]' % count
        print >> fp_ini, 'filename = %s' % filename
        print >> fp_ini, 'title = %s' % d['title']
        print >> fp_ini, 'number_tables = %d' % d['number_tables']
        print >> fp_ini, 'number_images = %d' % d['number_images']
        print >> fp_ini, 'node_ids = '
        for node_id in d['node_ids']:
            print >> fp_ini, '    ' + node_id
        print >> fp_ini
    fp_ini.close()
    return docs[1:]
def getView(document, css):
    """
    document
        a DOM document, currently an lxml HTML document
    css
        a CSS StyleSheet string

    returns style view
        a dict of {DOMElement: css.CSSStyleDeclaration} for html
    """
    from lxml.cssselect import CSSSelector

    sheet = cssutils.parseString(css)
    view = {}
    specificities = {}  # needed temporarily
    # TODO: filter rules simpler?, add @media
    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in rules:
        for selector in rule.selectorList:
            # log(0, 'SELECTOR', selector.selectorText)
            # TODO: make this a callback to be able to use other stuff than lxml
            try:
                cssselector = CSSSelector(selector.selectorText)
            except Exception:
                continue
            matching = cssselector.evaluate(document)
            for element in matching:
                # if element.tag in ('div',):
                # add styles for all matching DOM elements
                # log(1, 'ELEMENT', id(element), element.text)
                if element not in view:
                    # add initial empty style declaration
                    view[element] = cssutils.css.CSSStyleDeclaration()  # @UndefinedVariable
                    specificities[element] = {}
                for p in rule.style:
                    # update style declaration
                    if p not in view[element]:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        view[element].setProperty(p.name, p.value, p.priority)
                        specificities[element][p.name] = selector.specificity
                        # log(2, view[element].getProperty('color'))
                    else:
                        # log(2, view[element].getProperty('color'))
                        sameprio = (p.priority ==
                                    view[element].getPropertyPriority(p.name))
                        if not sameprio and bool(p.priority) or (
                                sameprio and selector.specificity >=
                                specificities[element][p.name]):
                            # later, more specific or higher prio
                            view[element].setProperty(p.name, p.value, p.priority)
    return view
lasturl = "" while True: req = urllib2.Request(url) req.add_header("User-Agent", useragent) if lasturl: req.add_header("Referer", lasturl) html = unicode(urllib2.urlopen(req).read(), errors="ignore") doc = etree.HTML(html) rtr = CSSSelector("ol#rtr")(doc) if rtr: numresults = len(rtr[0].getchildren()) else: numresults = 0 print "hit " + url + " got " + str(numresults) + " results" rhscol = CSSSelector("div#rhscol")(doc)[0] links = [a for a in rhscol.getiterator("a")] if len(links) != 3 or "Older" not in links[1].text or "Newer" not in links[2].text: print "Cant find older and newer links here, backing up" oldurl = page.url match = re.search("mbl_hs:(\d+),mbl_he:(\d+),mbl_rs:(\d+),mbl_re:(\d+)", oldurl) mbl_hs = int(match.group(1)) + 600 mbl_he = int(match.group(2)) + 600 mbl_rs = int(match.group(3)) + 600 mbl_re = int(match.group(4)) + 600 url = oldurl.replace( match.group(0), "mbl_hs:" + str(mbl_hs) + ",mbl_he:" + str(mbl_he) + ",mbl_rs:" + str(mbl_rs) + ",mbl_re:" + str(mbl_re), ) lasturl = oldurl time.sleep(10)
document = etree.HTML(html)
e = etree.Element('pre', {'class': 'cssutils'})
e.text = css
document.find('body').append(e)

sheet = cssutils.parseString(css)

view = {}
specificities = {}  # temporarily needed
# TODO: filter rules simpler?, add @media
rules = (rule for rule in sheet.cssRules if rule.type == rule.STYLE_RULE)
for rule in rules:
    for selector in rule.selectorList:
        cssselector = CSSSelector(selector.selectorText)
        elements = cssselector.evaluate(document)
        for element in elements:
            # add styles for all matching DOM elements
            if element not in view:
                # add initial
                view[element] = cssutils.css.CSSStyleDeclaration()
                specificities[element] = {}
            for p in rule.style:
                # update styles
                if p not in view[element]:
                    view[element].setProperty(p)
                    specificities[element][p.name] = selector.specificity
                else:
                    # the original snippet breaks off mid-statement here; the
                    # completion below follows the parallel getView implementations
                    sameprio = (p.priority ==
                                view[element].getPropertyPriority(p.name))
                    if not sameprio and bool(p.priority) or (
                            sameprio and selector.specificity >=
                            specificities[element][p.name]):
                        view[element].setProperty(p)
sel = CSSSelector('table tbody tr')
rows = sel(tree)
print "Row results: ", len(rows)

num_operating = 0
for row in rows:
    # This is unreliable; I don't know how to get just the text 'Operating':
    #   phase = CSSSelector('td:nth-of-type(4)')(row)[0]
    #   lxml.html.tostring(phase)
    #   '<td><span class="hide">3</span>Operating</td>'
    phase = CSSSelector('td:nth-of-type(4)')(row)[0].text_content()  # '3Operating'
    if 'Operating' in phase:
        num_operating += 1
    # Show phase because we may have stale ones in iSat
    division = CSSSelector('td:nth-of-type(1)')(row)[0]
    try:
        division = division.text.strip()
    except AttributeError, e:
        division = 'NOTFOUND'
    mission = CSSSelector('td:nth-of-type(2) > a')(row)[0]
    mission_name = mission.text.strip()
    mission_url = mission.get('href')  # /missions/xmm-newton/
    mission_slug = mission_url.split('/')[2]
    # (the original incremented num_operating a second time here,
    # unconditionally, which double-counted every row)
    try:
        print '%-30s\t%-40s\t%-20s\t%-20s' % (mission_slug,
                                              mission_name.encode('ascii', 'ignore'),
                                              division, phase)
    except UnicodeEncodeError, e:
        print "F*****g unicode problem: ", e
        import pdb; pdb.set_trace()

print 'Operating:', num_operating
def getView(self, document, sheet, media='all', name=None, styleCallback=None):
    """
    document
        a DOM document, currently an lxml HTML document
    sheet
        a CSS StyleSheet object, currently a cssutils sheet
    media: optional
        TODO: view for which media it should be
    name: optional
        TODO: names of sheets only
    styleCallback: optional
        should return css.CSSStyleDeclaration of inline styles, for html
        a style declaration for ``element@style``. Gets one parameter
        ``element`` which is the relevant DOMElement

    returns style view
        a dict of {DOMElement: css.CSSStyleDeclaration} for html
    """
    styleCallback = styleCallback or self.styleattribute
    _unmergable_rules = CSSStyleSheet()
    view = {}
    specificities = {}  # needed temporarily
    # TODO: filter rules simpler?, add @media
    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in rules:
        for selector in rule.selectorList:
            self.log(0, 'SELECTOR', selector.selectorText)
            # TODO: make this a callback to be able to use other stuff than lxml
            try:
                cssselector = CSSSelector(selector.selectorText)
            except (ExpressionError, NotImplementedError) as e:
                _unmergable_rules.add(CSSStyleRule(selectorText=selector.selectorText,
                                                   style=rule.style))
                continue
            matching = cssselector.evaluate(document)
            for element in matching:
                if element.tag in self.NONVISUAL_TAGS:
                    continue
                # add styles for all matching DOM elements
                self.log(1, 'ELEMENT', id(element), element.text)
                if element not in view:
                    # add initial empty style declaration
                    view[element] = CSSStyleDeclaration()
                    specificities[element] = {}
                    # and add inline @style if present
                    inlinestyle = styleCallback(element)
                    if inlinestyle:
                        for p in inlinestyle:
                            # set inline style specificity
                            view[element].setProperty(p)
                            specificities[element][p.name] = (1, 0, 0, 0)
                for p in rule.style:
                    # update style declaration
                    if p not in view[element]:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        view[element].setProperty(p.name, p.value, p.priority)
                        specificities[element][p.name] = selector.specificity
                        self.log(2, view[element].getProperty('color'))
                    else:
                        self.log(2, view[element].getProperty('color'))
                        sameprio = (p.priority ==
                                    view[element].getPropertyPriority(p.name))
                        if not sameprio and bool(p.priority) or (
                                sameprio and selector.specificity >=
                                specificities[element][p.name]):
                            # later, more specific or higher prio
                            view[element].setProperty(p.name, p.value, p.priority)
    _unmergable_css = _unmergable_rules.cssText
    if _unmergable_css:
        e = etree.Element('style')
        # print __name__, _unmergable_css.__repr__()
        e.text = to_unicode(_unmergable_css, 'utf-8')
        body = document.find('body') or document
        body.insert(0, e)  # add <style> right into body
    return view
def cssselect(expr, tree):
    return CSSSelector(expr)(tree)
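Usage sketch for the thin wrapper above, on a hypothetical tree:

import lxml.html

tree = lxml.html.fromstring('<ul><li class="item">a</li><li class="item">b</li></ul>')
for li in cssselect('li.item', tree):
    print(li.text)  # a, then b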
def Parse(reading):
    result = {"url": reading["url"]}
    text = reading["text"]
    text = re.sub("<p<", "", text)  # this error is too severe for the parser to handle
    doc = lxml.html.parse(StringIO(text))
    root = doc.getroot()
    # body = h.find(".//body")
    maindiv = CSSSelector("#divMiddleLeftCentreBottomRight")(root)[0]
    heading = CSSSelector("#divHeading h1")(maindiv)[0].text
    intro = CSSSelector("#divIntroduction h2")(maindiv)[0]
    h2 = lxml.etree.tounicode(intro)
    # print [heading, h2]
    mheading = re.match(u"([\w\s\-']*?)\s*(?:\u2013\s*(?:PPC for (.*?)$)?|$)", heading)
    result["name"] = mheading.group(1)
    mmpfor = re.search(u'(?:<br\s*/>)?\s*MP for (.*?)\s*<br\s*/>', h2)
    if mmpfor:
        # needs to be regularized for the 2005 boundaries
        result["MP for"] = mmpfor.group(1)
    mcandidate = re.search(u'Liberal Democrat candidate for <a href="in_your_area_detail.aspx.*?">(.*?)</a>', h2)
    if mcandidate:
        result["constituency"] = RegularizeConstituency(mcandidate.group(1))
    elif mheading.group(2):
        result["constituency"] = RegularizeConstituency(mheading.group(2))
    elif "MP for" in result:
        result["constituency"] = RegularizeConstituency(result["MP for"])
    else:
        assert False, (h2, heading)
    divImage = maindiv.cssselect("#divIntroduction a img")
    if divImage:
        result["image"] = divImage[0].get("src")
    # print maindiv.cssselect("#divAboutMe h2")[0].text, "About Me"
    for traboutme in maindiv.cssselect("#divAboutMe tr"):
        key = traboutme.cssselect("th")[0].text[:-1]
        assert key in ["Marital Status", "Occupation", "Education"]
        value = traboutme.cssselect("td")[0].text
        if value:
            value = re.sub(u"\u2019", "'", value).strip()
            value = re.sub(u"\u2013", "-", value)
            value = re.sub("\xae", "", value)
            value = re.sub("\s*\n\s*", "; ", value)
            result[key] = value
    divBiography = maindiv.cssselect("#divBiography")
    if divBiography:
        result["bio"] = SimplifyHTML(divBiography[0])
        result["bio"] = re.sub("^Biography\s+", "", result["bio"])  # clean out leading title
    contacttext = lxml.etree.tounicode(maindiv.cssselect("#divIndividualContactInfo")[0])
    memail = re.search('<strong>Email:</strong> <a href="(?:mailto:)?(.*?)">', contacttext)
    if memail:
        result["email"] = memail.group(1)
    mwebsite = re.search('<strong>Website:</strong> <a href="(.*?)">', contacttext)
    if mwebsite:
        result["website"] = mwebsite.group(1)
    mphone = re.search('<strong>Telephone:</strong> ([\d\s]+)', contacttext)
    if mphone:
        result["phone"] = mphone.group(1).strip()
    address = "; ".join([addressline.text
                         for addressline in maindiv.cssselect("#divIndividualContactInfo ul li")])
    if address:
        # the database doesn't seem to be unicode; it should be
        result["address"] = address.encode("ascii", "replace")
    return result
def get_anime_info(self, obj):
    """Returns an AnimeInfoObject. A url, or any meta object can be passed"""
    url = ''
    if isinstance(obj, basestring):
        if obj[:1] == '/':
            url = BASE_URL[:-1] + obj
        else:
            url = obj  # Can take an absolute url
    else:
        url = obj.get_url()  # Any anime meta object
    content = self.conn.scrape.get(url)
    tree = lxml.html.fromstring(content.text)
    listing = CSSSelector('.bigBarContainer')(tree)
    if listing is None or len(listing) < 2:
        return None
    pgraphs = listing[0].cssselect('p')
    if pgraphs is None or len(pgraphs) < 5:
        return None
    extra_info = []
    if len(pgraphs) == 6:  # some animes don't have an air date
        extra_info = [x.strip() for x in pgraphs[3].itertext()]
    else:
        extra_info = [x.strip() for x in pgraphs[2].itertext()]
    if len(extra_info) < 5:  # Not valid at all?
        return None
    title = listing[0].cssselect('.bigChar')[0]
    if title is None:
        title = 'N/A'
    else:
        title = title.text
    alt_names = [x.text.strip().encode('utf-8') for x in pgraphs[0].cssselect('a')]
    tags = [x.text.strip().encode('utf-8') for x in pgraphs[1].cssselect('a')]
    airdate = 'N/A'
    if len(pgraphs) == 6:  # Only if we have 6 <p>'s is the airdate present
        airdate = "".join([x.strip() for x in pgraphs[2].itertext()])[11:]
    status = extra_info[2]
    views = extra_info[4]
    summary = 'N/A'
    if len(pgraphs) == 6:  # Due to airdate, summary can be shifted
        summary = "".join([x.strip() for x in pgraphs[5].itertext()])
    else:
        summary = "".join([x.strip() for x in pgraphs[4].itertext()])
    ep_list = listing[1].cssselect('tr')
    ep_meta = []
    if len(ep_list) > 2:
        for i in range(2, len(ep_list)):  # First two are junk
            episode = ep_list[i]
            info = episode.cssselect('td')
            if len(info) < 2:
                continue
            ep_name = "".join([x.strip() for x in info[0].itertext()])
            ep_url = BASE_URL[:-1]
            ep_url += info[0].cssselect('a')[0].get('href')
            ep_rel = info[1].text.strip()  # Remove whitespace
            ep_meta.append(AnimeEpisodeInfoObject(ep_name, ep_url, ep_rel))
    return AnimeEpisodeMetaObject(title, alt_names, tags, airdate, status,
                                  views, summary, ep_meta)
def buy(self, url):
    self.fd['city'] = self.citycode
    self.fd['house_flag'] = 3
    # self.fd['belong'] = "1"
    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    tree = etree.HTML(response)
    soup = BeautifulSoup(response)
    detail_mer = soup.find('div', {'class': 'detail_mer'})
    # Not an individual (owner-posted) listing: bail out
    if u"个人房源" not in str(detail_mer):
        return
    Dname = detail_mer.find('span', {'class': 'Dname'})
    if Dname:
        self.fd['owner_name'] = Dname.string
    else:
        self.fd['owner_name'] = None
    ganji_phone_call_class = detail_mer.find('span', {'class': 'ganji_phone_call_class'})
    if ganji_phone_call_class:
        self.fd['owner_phone'] = ganji_phone_call_class.contents[0]
        # the original tested .find('src='), whose -1 "not found" result is truthy
        if 'src=' in str(ganji_phone_call_class):
            self.fd['owner_phone'] = 'http://' + urlparse(url)[1] + ganji_phone_call_class.img['src']
        else:
            self.fd['owner_phone'] = None
    else:
        self.fd['owner_phone'] = None
    # No contact info: bail out
    if not self.fd['owner_phone']:
        return
    if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
        cityname = re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
        self.fd['cityname'] = cityname
    else:
        return
    self.fd['house_floor'] = 0
    self.fd['house_topfloor'] = 0
    self.fd['house_type'] = 0
    self.fd['house_age'] = 0
    self.fd['house_toward'] = 0
    self.fd['house_fitment'] = 0
    if re.search(self.house_totalarea_regex_qiu, response):
        house_totalarea = re.search(self.house_totalarea_regex_qiu, response).group(1)
        self.fd['house_totalarea'] = house_totalarea
        self.fd['house_totalarea_max'] = house_totalarea
        self.fd['house_totalarea_min'] = house_totalarea
    else:
        self.fd['house_totalarea'] = 0
        self.fd['house_totalarea_max'] = 0
        self.fd['house_totalarea_min'] = 0
    if re.search(self.house_price_regex_gou, response):
        house_price_zu = re.search(self.house_price_regex_gou, response).group(1)
        house_price_zu = house_price_zu.replace('万', '')
        if house_price_zu.find("以上") != -1:  # "X or above"
            self.fd['house_price_max'] = 0
            self.fd['house_price_min'] = house_price_zu.replace('以上', '')
            self.fd['house_price'] = self.fd['house_price_min']
        elif house_price_zu.find("以下") != -1:  # "X or below"
            self.fd['house_price_max'] = house_price_zu.replace('以下', '')
            self.fd['house_price_min'] = 0
            self.fd['house_price'] = self.fd['house_price_max']
        elif house_price_zu.find("-") != -1:
            self.fd['house_price_max'] = house_price_zu.split('-')[1]
            self.fd['house_price_min'] = house_price_zu.split('-')[0]
            self.fd['house_price'] = house_price_zu.split('-')[1]
        else:
            self.fd['house_price_max'] = 0
            self.fd['house_price_min'] = 0
            self.fd['house_price'] = 0
    else:
        self.fd['house_price_max'] = 0
        self.fd['house_price_min'] = 0
        self.fd['house_price'] = 0
    # Guard against an empty selector result instead of the old
    # "... != None and ... or ..." idiom, which crashed on empty lists
    pub_time = CSSSelector('span.pub_time')(tree)
    posttime = pub_time[0].text.strip() if pub_time else None
    if posttime:
        Y = int(time.strftime('%Y', time.localtime()))
        M = int(posttime.split(' ')[0].split('-')[0])
        D = int(posttime.split(' ')[0].split('-')[1])
        s = datetime.datetime(Y, M, D, 0, 0)
        posttime = int(time.mktime(s.timetuple()))
        self.fd['posttime'] = posttime
    else:
        self.fd['posttime'] = None
    if re.search(self.house_room_regex, response):
        house_room = re.search(self.house_room_regex, response).group(1)
        self.fd['house_room'] = house_room
    else:
        self.fd['house_room'] = '0'
    if re.search(self.house_hall_regex, response):
        house_hall = re.search(self.house_hall_regex, response).group(1)
        self.fd['house_hall'] = house_hall
    else:
        self.fd['house_hall'] = '0'
    if re.search(self.house_toilet_regex, response):
        house_toilet = re.search(self.house_toilet_regex, response).group(1)
        self.fd['house_toilet'] = house_toilet
    else:
        self.fd['house_toilet'] = '0'
house_title=CSSSelector("div.detail_title h1")(tree)[0] !=None and CSSSelector("div.detail_title h1")(tree)[0].text.strip() or None self.fd['house_title'] = house_title #描述 detail_box = soup.find('div',{'class':'detail_box'}) if detail_box: house_desc = str(detail_box('p')[1]) self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc) else: self.fd['house_desc'] = None d_i = soup.find('ul',{'class':'d_i'}) #小区名 #先处理JS if re.search(self.xiaoqu_regex, response): borough_name=re.search(self.xiaoqu_regex, response).group(1) self.fd['borough_name'] = borough_name if re.search(self.address_regex, response): house_addr=re.search(self.address_regex, response).group(1) self.fd['house_addr'] = house_addr else: if d_i.find(text="小区: "): borough_box = d_i.find(text="小区: ").parent borough_name = borough_box.find("a") if borough_name: self.fd['borough_name'] = borough_name.string else: self.fd['borough_name'] = None else: if re.search(self.borough_name_regex_reg, response): borough_name=re.search(self.borough_name_regex_reg, response).group(1) self.fd['borough_name'] = borough_name if re.search(self.house_addr_regex_reg, response): house_addr=re.search(self.house_addr_regex_reg, response).group(1) self.fd['house_addr'] = house_addr else: self.fd['house_addr'] = '' #区域 area_box = d_i.find(text="区域: ").parent area_a = area_box('a') if area_a and len(area_a)>1: self.fd['cityarea'] = area_a[0].string self.fd['section'] = area_a[1].string elif area_a and len(area_a)==1: self.fd['cityarea'] = area_a[0].string self.fd['section'] = None else: self.fd['cityarea'] = None self.fd['section'] = None
def find_by_css(self, selector):
    xpath = CSSSelector(selector).path
    return self.find_by_xpath(xpath, original_find="css", original_query=selector)
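The .path attribute used above holds the XPath expression that cssselect compiled from the CSS selector; a quick illustration (the exact string may vary by cssselect version):

from lxml.cssselect import CSSSelector

print(CSSSelector('div.content').path)
# e.g. descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' content ')]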
def rent(self, url):
    # self.fd['house_city'] = urlparse(url)[1].replace('.ganji.com', "")
    hc = urlparse(url)[1].replace('.ganji.com', "")
    hc2 = citynameDict_sf.get(hc)
    if hc2:
        self.fd['house_city'] = hc2
    else:
        self.fd['house_city'] = hc
    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd = {}
        raise
    tree = etree.HTML(response)
    if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
        cityname = re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
        self.fd['cityname'] = cityname
    else:
        raise
    self.fd['house_flag'] = 2
    self.fd['house_type'] = 6
    self.fd['house_floor'] = 0
    self.fd['house_topfloor'] = 0
    soup = BeautifulSoup(response)
    detail_mer = soup.find('div', {'class': 'detail_mer'})
    # Not an individual (owner-posted) listing: bail out
    if u"个人房源" not in str(detail_mer):
        raise
    Dname = detail_mer.find('span', {'class': 'Dname'})
    if Dname:
        self.fd['owner_name'] = str(Dname.string)
    else:
        self.fd['owner_name'] = ""
    ganji_phone_call_class = detail_mer.find('span', {'class': 'ganji_phone_call_class'})
    if ganji_phone_call_class:
        self.fd['owner_phone_pic'] = ganji_phone_call_class.contents[0]
        # the original tested .find('src='), whose -1 "not found" result is truthy
        if 'src=' in str(ganji_phone_call_class):
            self.fd['owner_phone_pic'] = 'http://' + urlparse(url)[1] + ganji_phone_call_class.img['src']
        else:
            self.fd['owner_phone_pic'] = None
    else:
        self.fd['owner_phone_pic'] = None
    # No contact info: bail out
    if not self.fd['owner_phone_pic']:
        raise
    if re.search(self.house_totalarea_regex, response):
        house_totalarea = re.search(self.house_totalarea_regex, response).group(1)
        self.fd['house_area'] = house_totalarea
    else:
        self.fd['house_area'] = None
    if re.search(self.house_price_regex_2, response):
        house_price = re.search(self.house_price_regex_2, response).group(1)
        if house_price == "面议":  # "price negotiable"
            house_price = 0
        self.fd['house_price'] = int(house_price)
    else:
        self.fd['house_price'] = 0
    # house_price = tree.xpath("/html/body/div[2]/div/div/ul/li/span") and tree.xpath("/html/body/div[2]/div/div/ul/li/span")[0].text.strip() or None
    # v['house_price'] = house_price
    # Guard against an empty selector result instead of the old
    # "... != None and ... or ..." idiom, which crashed on empty lists
    pub_time = CSSSelector('span.pub_time')(tree)
    posttime = pub_time[0].text.strip() if pub_time else None
    if posttime:
        Y = int(time.strftime('%Y', time.localtime()))
        M = int(posttime.split(' ')[0].split('-')[0])
        D = int(posttime.split(' ')[0].split('-')[1])
        H = int(time.strftime('%H', time.localtime(time.time())))
        Min = int(time.strftime('%M', time.localtime(time.time())))
        s = datetime.datetime(Y, M, D, H, Min)
        posttime = str(int(time.mktime(s.timetuple())))
        self.fd['house_posttime'] = posttime
    else:
        s = time.localtime(time.time())
        self.fd['house_posttime'] = str(int(time.mktime(s)))
    house_title_el = CSSSelector("div.detail_title h1")(tree)
    house_title = house_title_el[0].text.strip() if house_title_el else None
    self.fd['house_title'] = house_title.replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
    if re.search(self.house_room_regex, response):
        house_room = re.search(self.house_room_regex, response).group(1)
        self.fd['house_room'] = house_room
    else:
        self.fd['house_room'] = 0
    if re.search(self.house_hall_regex, response):
        house_hall = re.search(self.house_hall_regex, response).group(1)
        self.fd['house_hall'] = house_hall
    else:
        self.fd['house_hall'] = 0
    if re.search(self.house_toilet_regex, response):
        house_toilet = re.search(self.house_toilet_regex, response).group(1)
        self.fd['house_toilet'] = house_toilet
    else:
        self.fd['house_toilet'] = 0
    if re.search(self.house_veranda_regex, response):
        house_veranda = re.search(self.house_veranda_regex, response).group(1)
        self.fd['house_veranda'] = house_veranda
    else:
        self.fd['house_veranda'] = 0
    if re.search(self.house_floor_regex, response):
        house_floor = re.search(self.house_floor_regex, response).group(1)
        house_topfloor = re.search(self.house_floor_regex, response).group(2)
        self.fd['house_floor'] = int(house_floor)
        self.fd['house_topfloor'] = int(house_topfloor)
    else:
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0
    # (the original re-extracted house_title here, verbatim; dropped as redundant)
    # Description
    detail_box = soup.find('div', {'class': 'detail_box'})
    if detail_box:
        house_desc = str(detail_box('p')[1])
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的", "", house_desc)
    else:
        self.fd['house_desc'] = None
    d_i = soup.find('ul', {'class': 'd_i'})
    # Neighborhood (residential complex) name; handle the JS-rendered variant first
    if re.search(self.xiaoqu_regex, response):
        borough_name = re.search(self.xiaoqu_regex, response).group(1)
        self.fd['borough_name'] = borough_name
        if re.search(self.address_regex, response):
            house_addr = re.search(self.address_regex, response).group(1)
            self.fd['house_addr'] = house_addr
    else:
        if d_i.find(text="小区: "):
            borough_box = d_i.find(text="小区: ").parent
            borough_name = borough_box.find("a")
            if borough_name:
                self.fd['borough_name'] = borough_name.string
            else:
                self.fd['borough_name'] = None
            # Address
            if borough_name and borough_name.nextSibling:
                house_addr = borough_name.nextSibling.string
                self.fd['house_addr'] = re.sub("\(|\)| ", "", house_addr)
            else:
                self.fd['house_addr'] = None
        else:
            if re.search(self.borough_name_regex, response):
                borough_name = re.search(self.borough_name_regex, response).group(1)
                self.fd['borough_name'] = re.sub("\(.*\)| ", "", borough_name)
    # District / section
    area_box = d_i.find(text="区域: ").parent
    area_a = area_box('a')
    if area_a and len(area_a) > 1:
        self.fd['house_region'] = str(area_a[0].string)
        self.fd['house_section'] = str(area_a[1].string)
    elif area_a and len(area_a) == 1:
        self.fd['house_region'] = str(area_a[0].string)
        self.fd['house_section'] = ""
    else:
        self.fd['house_region'] = ""
        self.fd['house_section'] = ""
    if re.search(self.house_age_regex, response):
        house_age = re.search(self.house_age_regex, response).group(1)
        Y = int(time.strftime('%Y', time.localtime()))
        house_age = Y - int(house_age)
        self.fd['house_age'] = house_age
    else:
        self.fd['house_age'] = 0
    # Orientation
    if re.search(self.house_toward_regex, response):
        house_toward = re.search(self.house_toward_regex, response).group(1)
        self.fd['house_toward'] = toward(house_toward)
    else:
        self.fd['house_toward'] = 0
    if re.search(self.house_fitment_regex, response):
        house_fitment = re.search(self.house_fitment_regex, response).group(1)
        self.fd['house_fitment'] = fitment(house_fitment)
    else:
        self.fd['house_fitment'] = 2
    if re.search(self.house_deposit_regex, response):
        house_deposit = re.search(self.house_deposit_regex, response).group(1)
        self.fd['house_deposit'] = deposit(house_deposit)
    else:
        self.fd['house_deposit'] = None
    # Release references to the parsed documents
    request = None
    response = None
    soup = None
    tree = None
    del tree
    del request
    del response
    del soup
import sys
import urllib
import urllib2
import lxml.etree
from lxml.cssselect import CSSSelector
from BeautifulSoup import BeautifulSoup

if len(sys.argv) < 2:
    print >>sys.stderr, 'usage: weather.py CITY, STATE'
    sys.exit(2)

data = urllib.urlencode({'inputstring': ' '.join(sys.argv[1:])})
info = urllib2.urlopen('http://forecast.weather.gov/zipcity.php', data)
content = info.read()

# Solution #1
parser = lxml.etree.HTMLParser(encoding='utf-8')
tree = lxml.etree.fromstring(content, parser)
big = CSSSelector('td.big')(tree)[0]
if big.find('font') is not None:
    big = big.find('font')
print 'Condition:', big.text.strip()
print 'Temperature:', big.findall('br')[1].tail
tr = tree.xpath('.//td[b="Humidity"]')[0].getparent()
print 'Humidity:', tr.findall('td')[1].text
print

# Solution #2
soup = BeautifulSoup(content)
big = soup.find('td', 'big')
if big.font is not None:
    big = big.font
print 'Condition:', big.contents[0].string.strip()
temp = big.contents[3].string or big.contents[4].string  # can be either
def require(self, url):
    hc = urlparse(url)[1].replace('.ganji.com', "")
    hc2 = citynameDict_sf.get(hc)
    if hc2:
        self.fd['house_city'] = hc2
    else:
        self.fd['house_city'] = hc
    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd = {}
        raise
    tree = etree.HTML(response)
    if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
        cityname = re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
        self.fd['cityname'] = cityname
    else:
        raise
    self.fd['house_flag'] = 4
    self.fd['house_type'] = 6
    self.fd['house_floor'] = 0
    self.fd['house_topfloor'] = 0
    self.fd['house_area'] = 0
    self.fd['house_age'] = 0
    self.fd['house_toward'] = 0
    self.fd['house_fitment'] = 0
    self.fd['house_deposit'] = 0
    # self.fd['house_totalarea_max'] = 0
    # self.fd['house_totalarea_min'] = 0
    soup = BeautifulSoup(response)
    detail_mer = soup.find('div', {'class': 'detail_mer'})
    # Not an individual (owner-posted) listing: bail out
    if u"个人房源" not in str(detail_mer):
        raise
    Dname = detail_mer.find('span', {'class': 'Dname'})
    if Dname:
        self.fd['owner_name'] = Dname.string
    else:
        self.fd['owner_name'] = None
    ganji_phone_call_class = detail_mer.find('span', {'class': 'ganji_phone_call_class'})
    if ganji_phone_call_class:
        self.fd['owner_phone_pic'] = ganji_phone_call_class.contents[0]
        # the original tested .find('src='), whose -1 "not found" result is truthy
        if 'src=' in str(ganji_phone_call_class):
            self.fd['owner_phone_pic'] = 'http://' + urlparse(url)[1] + ganji_phone_call_class.img['src']
        else:
            self.fd['owner_phone_pic'] = None
    else:
        self.fd['owner_phone_pic'] = None
    # No contact info: bail out
    if not self.fd['owner_phone_pic']:
        raise
    if re.search(self.house_price_regex_zu, response):
        house_price_zu = re.search(self.house_price_regex_zu, response).group(1)
        house_price_zu = house_price_zu.replace('元/月', '')
        if house_price_zu.find("以上") != -1:  # "X or above"
            self.fd['house_price_max'] = 0
            self.fd['house_price'] = int(house_price_zu.replace('以上', ''))
        elif house_price_zu.find("以下") != -1:  # "X or below"
            self.fd['house_price_max'] = int(house_price_zu.replace('以下', ''))
            self.fd['house_price'] = 0
        elif house_price_zu.find("-") != -1:
            self.fd['house_price_max'] = int(house_price_zu.split('-')[1])
            self.fd['house_price'] = int(house_price_zu.split('-')[0])
        else:
            self.fd['house_price_max'] = 0
            self.fd['house_price'] = 0
    else:
        self.fd['house_price_max'] = 0
        self.fd['house_price'] = 0
    # Guard against an empty selector result instead of the old
    # "... != None and ... or ..." idiom, which crashed on empty lists
    pub_time = CSSSelector('span.pub_time')(tree)
    posttime = pub_time[0].text.strip() if pub_time else None
    if posttime:
        Y = int(time.strftime('%Y', time.localtime()))
        M = int(posttime.split(' ')[0].split('-')[0])
        D = int(posttime.split(' ')[0].split('-')[1])
        H = int(time.strftime('%H', time.localtime(time.time())))
        Min = int(time.strftime('%M', time.localtime(time.time())))
        s = datetime.datetime(Y, M, D, H, Min)
        posttime = str(int(time.mktime(s.timetuple())))
        self.fd['house_posttime'] = posttime
    else:
        s = time.localtime(time.time())
        self.fd['house_posttime'] = str(int(time.mktime(s)))
    house_title_el = CSSSelector("div.detail_title h1")(tree)
    house_title = house_title_el[0].text.strip() if house_title_el else None
    self.fd['house_title'] = house_title.replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
    if re.search(self.house_room_regex, response):
        house_room = re.search(self.house_room_regex, response).group(1)
        self.fd['house_room'] = int(house_room)
    else:
        self.fd['house_room'] = 0
    if re.search(self.house_hall_regex, response):
        house_hall = re.search(self.house_hall_regex, response).group(1)
        self.fd['house_hall'] = int(house_hall)
    else:
        self.fd['house_hall'] = 0
    if re.search(self.house_toilet_regex, response):
        house_toilet = re.search(self.house_toilet_regex, response).group(1)
        self.fd['house_toilet'] = int(house_toilet)
    else:
        self.fd['house_toilet'] = 0
    if re.search(self.house_veranda_regex, response):
        house_veranda = re.search(self.house_veranda_regex, response).group(1)
        self.fd['house_veranda'] = int(house_veranda)
    else:
        self.fd['house_veranda'] = 0
    # (the original re-extracted house_title here, verbatim; dropped as redundant)
    # Description
    detail_box = soup.find('div', {'class': 'detail_box'})
    if detail_box:
        house_desc = str(detail_box('p')[1])
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的", "", house_desc)
    else:
        self.fd['house_desc'] = ""
    d_i = soup.find('ul', {'class': 'd_i'})
    # Neighborhood (residential complex) name; handle the JS-rendered variant first
    if re.search(self.xiaoqu_regex, response):
        borough_name = re.search(self.xiaoqu_regex, response).group(1)
        self.fd['borough_name'] = borough_name
        if re.search(self.address_regex, response):
            house_addr = re.search(self.address_regex, response).group(1)
            self.fd['house_addr'] = house_addr
    else:
        if re.search(self.borough_name_regex_reg, response):
            borough_name = re.search(self.borough_name_regex_reg, response).group(1)
            self.fd['borough_name'] = borough_name
        if re.search(self.house_addr_regex_reg, response):
            house_addr = re.search(self.house_addr_regex_reg, response).group(1)
            self.fd['house_addr'] = house_addr
        else:
            self.fd['house_addr'] = ''
    # District / section
    area_box = d_i.find(text="区域: ").parent
    area_a = area_box('a')
    if area_a and len(area_a) > 1:
        self.fd['house_region'] = str(area_a[0].string)
        self.fd['house_section'] = str(area_a[1].string)
    elif area_a and len(area_a) == 1:
        self.fd['house_region'] = str(area_a[0].string)
        self.fd['house_section'] = ""
    else:
        self.fd['house_region'] = ""
        self.fd['house_section'] = ""
    # Release references to the parsed documents
    request = None
    response = None
    soup = None
    tree = None
    del tree
    del request
    del response
    del soup
import sys
import requests
import lxml.html
from lxml.cssselect import CSSSelector

# get page
url = sys.argv[1]
page = requests.get(url).text
page = page.replace('\xa0', ' ')
tree = lxml.html.fromstring(page)

# get title
title_tag = CSSSelector('div#main h1')(tree)[0]
title = title_tag.text_content()
fb2 = title.find(' (fb2)')
if fb2 != -1:
    title = title[:fb2]

# get text
text_tag = CSSSelector('div#main div._ga1_on_')(tree)[0]
text = text_tag.text_content().strip()

# get refs
ref_sup_tags = CSSSelector('sup')(text_tag)
ref_tags = [CSSSelector('a')(ref_sup_tag)[1] for ref_sup_tag in ref_sup_tags]
refs = [ref_tag.get('title').strip() for ref_tag in ref_tags]
def autocomplete_input(self, et):
    return CSSSelector('input.autocomplete')(et)[0]
def _parse_html_for_translation(html):
    """
    This function breaks down anchors and splits them into two divs.
    These will show up as two strings on transifex.
    :param html:
    :return:
    """
    p = re.compile(r'<.*?>')
    if p.findall(html):
        html = unicode(BeautifulSoup(html).prettify())
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)

        a = CSSSelector('a')
        translatable_a = CSSSelector('a.translatable')
        img = CSSSelector('img:not(.image-translatable)')

        # Translatable anchors are split into text and links
        anchors = translatable_a(tree.getroot())
        logger.info(str(anchors))
        for anchor in anchors:
            attributes = [("data-a-{}".format(k), v)
                          for k, v in dict(anchor.attrib).iteritems()]
            div = etree.Element('div')
            content = etree.parse(
                StringIO("<div class=\"text\">{}</div>".format(
                    stringify_children(anchor)))).getroot()
            href_format = """<div class=\"href\">{}</div>"""
            href_html = fix_html_fragment(href_format.format(anchor.attrib['href']))
            link = etree.parse(StringIO(href_html)).getroot()
            for k, v in attributes:
                div.attrib[k] = v
            div.attrib['class'] = 'former-anchor-translatable'
            div.append(content)
            div.append(link)
            swap_element(div, anchor)

        # Anchors are just the text
        anchors = a(tree.getroot())
        for anchor in anchors:
            attributes = [("data-a-{}".format(k), v)
                          for k, v in dict(anchor.attrib).iteritems()]
            anchor_format = "<div class=\"former-anchor\">{}</div>"
            anchor_html = fix_html_fragment(anchor_format.format(stringify_children(anchor)))
            div = etree.parse(StringIO(anchor_html)).getroot()
            for k, v in attributes:
                div.attrib[k] = v
            swap_element(div, anchor)

        # Images are just copies of the attributes
        images = img(tree.getroot())
        for image in images:
            div = etree.Element('div')
            attributes = [("data-img-{}".format(k), v)
                          for k, v in dict(image.attrib).iteritems()]
            for k, v in attributes:
                div.attrib[k] = v
            div.attrib['class'] = 'former-image'
            swap_element(div, image)

        html = etree.tostring(tree)

    # Chicken coop de grass
    # Massive regex that takes in phone numbers and puts them in divs,
    # only to be postprocessed below and disappear from the translations
    p = re.compile(
        r'((?:\+\s*)*\d+(?:\s+\(*\d+\)*)*\d+(?:\s+\d+\(*\)*)+|\d+(?:\s+\d+)+|00\d+(?:\s+\d+)+)'
    )
    html = p.sub('<div class="former-tel">\g<1></div>', html)
    soup = BeautifulSoup(html)
    for div in soup.find_all('div'):
        tag_format = None
        while div.parent and div.parent.name in ['b', 'em', 'i', 'strong', 'u']:
            if div.parent.name == "b":
                div.parent.unwrap()
                tag_format = "<b>{}</b>"
            if div.parent.name == "strong":
                div.parent.unwrap()
                tag_format = "<strong>{}</strong>"
            if div.parent.name == "em":
                div.parent.unwrap()
                tag_format = "<em>{}</em>"
            if div.parent.name == "i":
                div.parent.unwrap()
                tag_format = "<i>{}</i>"
            if div.parent.name == "u":
                div.parent.unwrap()
                tag_format = "<u>{}</u>"
        if tag_format:
            children = "".join([unicode(c) for c in div.contents])
            div.clear()
            child_soup = BeautifulSoup(tag_format.format(children))
            if child_soup.body:
                child_frag = child_soup.body.next
            elif child_soup.html:
                child_frag = child_soup.html.next
            else:
                child_frag = child_soup
            div.append(child_frag)
    for n in soup.select('u, b, i, em, strong'):
        if not n.text.strip():
            n.extract()
    for tel in soup.select('div.former-tel'):
        number = tel.text
        classes = ['former-tel']
        if tel.select('b'):
            classes.append('has-b')
        if tel.select('em'):
            classes.append('has-em')
        if tel.select('strong'):
            classes.append('has-strong')
        if tel.select('i'):
            classes.append('has-i')
        if tel.select('u'):
            classes.append('has-u')
        tel.attrs['data-tel-number'] = number
        tel.attrs['class'] = classes
        tel.clear()
    return soup.prettify()
def main():
    global COUNT
    site_url = "http://www.sanskritlibrary.org/"
    seed_url = "http://www.sanskritlibrary.org/textsList.html"
    titus_url = "http://titus.uni-frankfurt.de"
    p = Page(seed_url)
    a_tags = CSSSelector('a')
    div_tags = CSSSelector('div')
    span_tags = CSSSelector('span')
    body_tags = CSSSelector('body')
    div = [e for e in div_tags(p.dom) if e.get("class") == "text"]
    div = div[0]
    links = [site_url + i.get("href") for i in div.getchildren() if i.tag == 'a']
    print "Links of texts:", len(links)
    source_links = list()
    # Creating list of links
    for l in links:
        lpage = Page(l)
        slinks = [i.get("href") for i in a_tags(lpage.dom) if i.get("target") == "source"]
        source_links += slinks
    print "Links of sources:", len(source_links)  # 134
    source_links = list(set(source_links))
    print "Unique links of sources:", len(source_links)  # 94
    # Considering only ramayana and mahabharat links
    source_links = [i for i in source_links if ("/mbh" in i or "/ram" in i)]
    pp.pprint(source_links)
    b = p.selenium_load()
    for link in source_links:
        lp = link
        print "SOURCE_LINK", link
        while lp:
            try:
                b.get(lp)
                sleep(0.25)
                b.switch_to_frame(b.find_elements_by_tag_name("frame")[0])
                bdom = html.fromstring(b.page_source, parser=html.HTMLParser(encoding='utf-8'))
                bt = body_tags(bdom)
                if len(bt) == 0:
                    print "No body tag for " + lp
                    continue
                body = bt[0]
                f = open("download/" + lp[lp.rfind("/") + 1:] + ".txt", 'w')
                f.write(body.text_content().encode('utf-8'))
                f.close()
                print "File no. " + str(COUNT) + " created"
                COUNT += 1
                anchors = a_tags(bdom)
                lp = None
                for i in range(len(anchors) - 1, max(0, len(anchors) - 5), -1):
                    if (len(anchors[i].getchildren()) == 1
                            and anchors[i].getchildren()[0].tag == "img"
                            and "arribar" in anchors[i].getchildren()[0].get("src")):
                        href = anchors[i].get("href")
                        lp = titus_url + href
                        print i, len(anchors) - i
                        print "New frame:", lp
                        break
            except:
                lp = None
def select_all(tree, expr):
    sel = CSSSelector(expr)
    return sel(tree)
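A compiled CSSSelector can also be built once and reused across many trees, which avoids recompiling the expression per call; a small sketch with illustrative documents:

import lxml.html
from lxml.cssselect import CSSSelector

links = CSSSelector('a[href]')  # compile once
for doc in ('<p><a href="/x">x</a></p>', '<p><a href="/y">y</a></p>'):
    tree = lxml.html.fromstring(doc)
    print([a.get('href') for a in links(tree)])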
class GoogleNews:

    SELECTOR = {
        "title": CSSSelector(".l.lLrAF"),
        "summary": CSSSelector("div.st"),
        "date": CSSSelector(".f.nsa.fwzPFf"),
        "source": CSSSelector(".xQ82C.e8fRJf"),
        "source_url": CSSSelector(".top.NQHJEb.dfhHve"),
        "image_url": CSSSelector("img.th.BbeB2d"),
    }

    def __init__(self, title, summary, date, source, source_url, image_url):
        self._title = title
        self._summary = summary
        self._date = date
        self._source = source
        self._source_url = source_url
        self._image_url = image_url

    @classmethod
    def from_source(cls, news_source):
        return cls(
            title=cls._get_element_content(news_source, "title"),
            summary=cls._get_element_content(news_source, "summary"),
            date=cls._get_element_content(news_source, "date"),
            source=cls._get_element_content(news_source, "source"),
            source_url=cls._get_element_attribute(news_source, "source_url", "href"),
            image_url=cls._get_element_attribute(news_source, "image_url", "src"),
        )

    @staticmethod
    def _get_element_content(source, element_name):
        selected_area = GoogleNews._select_area(source, element_name)
        if selected_area is not None:
            return selected_area.text_content()
        return None

    @staticmethod
    def _get_element_attribute(source, element_name, attribute_name):
        selected_area = GoogleNews._select_area(source, element_name)
        if selected_area is not None:
            return selected_area.get(attribute_name)
        return None

    @staticmethod
    def _select_area(source, selector_key):
        selected_area = GoogleNews.SELECTOR[selector_key](source)
        # if len(selected_area) > 1:
        #     raise GoogleNewsError("More than one selection matches the current criteria")
        if len(selected_area) < 1:
            return None
            # raise GoogleNewsError("No selection matches the current criteria")
        return selected_area.pop()

    def display(self):
        print(f"Date: {self._date}")
        print(self._title)
        print(self._summary)
        print(self._source)
        print(self._source_url)

    def as_json(self):
        json_representation = {
            "title": self._title,
            "summary": self._summary,
            "date": self._date,
            "source": self._source,
            "source_url": self._source_url,
            "image_url": self._image_url,
        }
        return json_representation
def _select(self, selector_str):
    """Use a css selector string to query the corresponding etree elements."""
    sel = CSSSelector(selector_str)
    return (e for e in sel(self._tree.getroot()))
class GoogleSearch:
    BASE_URL = "https://www.google.com/search"
    BASE_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
        "Accept-Language": "pl,en-US;q=0.7,en;q=0.3"
    }
    SELECTORS = {
        "news": CSSSelector("div.g"),
        "unsuccessful_search": CSSSelector("div.mnr-c")
    }

    @classmethod
    def get_news_as_json(cls, query, start_date, end_date, limit=10):
        returned_news = cls.get_news(
            query=query,
            start_date=start_date,
            end_date=end_date,
            limit=limit
        )
        return json.dumps([news.as_json() for news in returned_news])

    @classmethod
    def get_news(cls, query, start_date, end_date, limit=10):
        """
        Performs Google News search with given criteria.

        :param query: Main query to search for (content of search box)
        :param start_date: (Optional) start day of the search
        :param end_date: (Optional) end day of the search
        :param limit: (default=10) max number of news to return
        :return: list of GoogleNews
        """
        parsed_news = list()
        page_index = 0
        while True:
            raw_search_page = GoogleSearch._get_search_page(
                query=query,
                search_type="nws",
                start_date=start_date,
                end_date=end_date,
                page_index=page_index
            )
            if not GoogleSearch.page_search_successful(raw_search_page):
                return parsed_news
            raw_all_news_on_page = GoogleSearch.SELECTORS["news"](raw_search_page)
            for raw_news in raw_all_news_on_page:
                if len(parsed_news) >= limit:
                    return parsed_news
                parsed_news.append(GoogleNews.from_source(raw_news))
            page_index += 1

    @staticmethod
    def page_search_successful(raw_search_page):
        """
        Checks if a given page contains valid search results.

        :param raw_search_page: parsed html page
        :return: True if results are valid; False otherwise
        """
        selection = GoogleSearch.SELECTORS["unsuccessful_search"](raw_search_page)
        return not len(selection)

    @classmethod
    def get_images(cls):
        raise NotImplementedError()

    @staticmethod
    def _get_search_page(query, search_type, start_date, end_date, page_index):
        custom_date_range = f"cdr:1,cd_min:{start_date},cd_max:{end_date}"
        payload = {
            "q": query,
            "tbs": custom_date_range,
            "tbm": search_type,
            "start": page_index * 10
        }
        response = requests.get(
            GoogleSearch.BASE_URL,
            params=payload,
            headers=GoogleSearch.BASE_HEADERS
        )
        if response.status_code != 200:
            raise GoogleSearchError(f"Response status code was {response.status_code}")
        return html.fromstring(response.text)
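# A hedged usage sketch for GoogleSearch.get_news; the query is made up and
# the date strings assume the "MM/DD/YYYY" format used by Google's cdr
# ("custom date range") tbs parameter.
if __name__ == '__main__':
    for news in GoogleSearch.get_news(query='lxml cssselect',
                                      start_date='01/01/2018',
                                      end_date='12/31/2018',
                                      limit=3):
        news.display()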
import re
import urllib.parse
import time
import random
import unicodedata

PATTERN = re.compile(r"^\s*(\d+)\s+(.+?)$", re.UNICODE)

FN_VERB_LIST = "./all-verbs-count.txt"
VERB_LIST = []
# with open(FN_VERB_LIST, "r") as fh:
#     VERB_LIST = [item.rstrip() for item in fh.readlines()]
with open(FN_VERB_LIST, "rb") as fh:
    VERB_LIST = [item.decode("utf-8").rstrip() for item in fh.readlines()]
LENGTH = len(VERB_LIST)

TRANSLATE_STUB = "http://www.spanishdict.com/translate/"
BIG_DICT = {"verbs": {}}

MISMATCH_CSS = CSSSelector(".mismatch")
TEST1 = re.compile(r"represents different", re.UNICODE)
GET_INF1 = re.compile(r"\*\*.+?\*\* represents .+? \*\*(.+?)\*\*", re.UNICODE)
TEST2 = re.compile(r"\*\*.+?\*\* is the", re.UNICODE)
GET_INF2 = re.compile(
    r"\*\*.+?\*\* is the (\w+) form of \*\*(.+?)\*\* in the (\w+ \w+) (\w+)",
    re.UNICODE)
BK_REGEX = re.compile(r'<div variation-type="mismatch-verb.+?>(.+?)</div>',
                      re.UNICODE)


def parse_line(line):
    mat = PATTERN.search(line)
def _modify_nodes_inplace(root: etree._Element, css_selector_str: str, fn: Callable):
    sel = CSSSelector(css_selector_str, translator='html')
    for w in sel(root):
        fn(w)
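# Illustrative call, assuming lxml.etree is in scope; _strip_dir is a
# hypothetical callback that removes dir attributes from matched elements.
def _strip_dir(el: etree._Element) -> None:
    if 'dir' in el.attrib:
        del el.attrib['dir']

_demo_root = etree.HTML('<p dir="rtl">hi</p>')
_modify_nodes_inplace(_demo_root, 'p[dir]', _strip_dir)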
def check_css(self, html, *selectors):
    '''
    Checks if a series of CSS selectors are present in the HTML. For example:

        self.check_css(html,
            ('h1', 'Admin'),              # first H1 is Admin
            ('h1', 'is', 'Admin'),        # first H1 has text Admin
            ('img', 'is', {'src': 'X'}),  # first img has src="X"
            ('h1', 1, 'X'),               # second H1 should have text X
            ('h1', -1, 'X'),              # last H1 should have text X
            ('h1', 'has', 'X'),           # any H1 should have text X
            ('h1', 'all', 'X'),           # all H1 should have text X
        )
    '''
    import re
    import lxml.html
    from lxml.cssselect import CSSSelector

    tree = lxml.html.fromstring(html)
    for selector in selectors:
        if len(selector) == 2:
            (css, val), how = selector, 'is'
        elif len(selector) == 3:
            css, how, val = selector
        else:
            raise ValueError('Selector %s must be a (css, how, val) triple' % selector)
        # Check all matching nodes. At least one node must exist
        nodes = CSSSelector(css)(tree)
        ok_(len(nodes) > 0, 'CSS %s missing' % css)
        # val must be a dict. Convert text values to dict. Raise error for rest
        if isinstance(val, six.string_types):
            val = {'@text': val}
        elif not isinstance(val, dict):
            raise ValueError('CSS %s has invalid value %s' % (css, val))
        for attr, v in val.items():
            if attr == '@text':
                actuals = [node.text for node in nodes]
            else:
                actuals = [node.get(attr, None) for node in nodes]
            # Try substring search. Else try regexp search
            regex = re.compile(v)
            match = lambda x: v in x or regex.search(x)  # noqa
            # First or specified selector should match v
            if how == 'is' or isinstance(how, int):
                actual = actuals[0 if how == 'is' else how]
                if not match(actual):
                    self.fail('CSS %s@%s = %s != %s' % (css, attr, actual, v))
            # Any selector should match v
            elif how in {'has', 'any'}:
                if not any(match(actual) for actual in actuals):
                    self.fail('CSS %s@%s has no %s' % (css, attr, v))
            # All selectors should match v
            elif how == 'all':
                if not all(match(actual) for actual in actuals):
                    self.fail('CSS %s@%s is not all %s' % (css, attr, v))
            else:
                raise ValueError('CSS %s: invalid how: "%s"' % (css, how))
    return tree
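# Sketch of how check_css might be invoked from a test method; the HTML and
# the expectations are made up for illustration.
#
#     html = '<h1>Admin</h1><img src="logo.png">'
#     self.check_css(html,
#                    ('h1', 'Admin'),
#                    ('img', 'is', {'src': 'logo.png'}))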
def handle(self, *args, **options):
    if not args:
        return
    page_id, = args
    parser = etree.HTMLParser()
    selector = CSSSelector('body')
    # content = selector(tree.getroot())
    dict_list = []
    page = Page.objects.get(id=page_id)
    page = page.get_draft_object()
    for placeholder in page.get_placeholders():
        for plugin in placeholder.get_plugins('en'):
            instance, t = plugin.get_plugin_instance()
            typename = type(t).__name__
            if typename == 'TextPlugin':
                tree = etree.parse(StringIO.StringIO(instance.body), parser).getroot()
                for child in instance.get_children():
                    child_instance, child_type = child.get_plugin_instance()
                    child_type_name = type(child_type).__name__
                    img = CSSSelector('[id=plugin_obj_{}]'.format(child_instance.id))(tree)
                    if not img:
                        child.delete()
                        continue
                    img = img[0]
                    parent = img.getparent()
                    element = None
                    if child_type_name == "LinkPlugin":
                        element = etree.Element('a', attrib={
                            "target": "_blank",
                            "href": child_instance.url
                        })
                        element.text = child_instance.name
                    elif child_type_name == "CMSLinkButtonPlugin":
                        element = etree.Element('a', attrib={
                            "class": "link-button",
                            "target": "_blank",
                            "href": child_instance.url
                        })
                        element.text = child_instance.name
                    if element is not None:
                        parent.insert(parent.index(img), element)
                        parent.remove(img)
                        child.delete()
                body = selector(tree)[0]
                out = (body.text or '') + '\n'.join(
                    [etree.tostring(h, pretty_print=True, method="html")
                     for h in list(body)]
                )
                instance.body = out
                instance.save()
def transform(self, pretty_print=True):
    """change the self.html and return it with CSS turned into style
    attributes.
    """
    if etree is None:
        return self.html

    parser = etree.HTMLParser()
    stripped = self.html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    root = tree if stripped.startswith(tree.docinfo.doctype) else page

    if page is None:
        print repr(self.html)
        raise PremailerError("Could not parse the html")
    assert page is not None

    ##
    ## style selectors
    ##

    rules = []
    index = 0

    for element in CSSSelector('style,link[rel~=stylesheet]')(page):
        # If we have a media attribute whose value is anything other than
        # 'screen', ignore the ruleset.
        media = element.attrib.get('media')
        if media and media != 'screen':
            continue

        is_style = element.tag == 'style'
        if is_style:
            css_body = element.text
        else:
            href = element.attrib.get('href')
            if not href:
                continue
            css_body = self._load_external(href)

        these_rules, these_leftover = self._parse_style_rules(css_body, index)
        index += 1
        rules.extend(these_rules)

        parent_of_element = element.getparent()
        if these_leftover:
            if is_style:
                style = element
            else:
                style = etree.Element('style')
                style.attrib['type'] = 'text/css'
            style.text = '\n'.join([
                '%s {%s}' % (k, make_important(v))
                for (k, v) in these_leftover
            ])
            if self.method == 'xml':
                style.text = etree.CDATA(style.text)
            if not is_style:
                element.addprevious(style)
                parent_of_element.remove(element)
        elif not self.keep_style_tags or not is_style:
            parent_of_element.remove(element)

    if self.external_styles:
        for stylefile in self.external_styles:
            css_body = self._load_external(stylefile)
            these_rules, these_leftover = self._parse_style_rules(css_body, index)
            index += 1
            rules.extend(these_rules)

    # rules is a list of (specificity, selector, styles) tuples, where
    # specificity is a tuple ordered such that more specific rules sort
    # larger.
    rules.sort(key=operator.itemgetter(0))

    first_time = []
    first_time_styles = []
    for __, selector, style in rules:
        new_selector = selector
        class_ = ''
        if ':' in selector:
            new_selector, class_ = re.split(':', selector, 1)
            class_ = ':%s' % class_
        # Keep filter-type selectors untouched.
        if class_ in FILTER_PSEUDOSELECTORS:
            class_ = ''
        else:
            selector = new_selector

        sel = CSSSelector(selector)
        for item in sel(page):
            old_style = item.attrib.get('style', '')
            if item not in first_time:
                new_style = merge_styles(old_style, style, class_)
                first_time.append(item)
                first_time_styles.append((item, old_style))
            else:
                new_style = merge_styles(old_style, style, class_)
            item.attrib['style'] = new_style
            self._style_to_basic_html_attributes(item, new_style, force=True)

    # Re-apply initial inline styles.
    for item, inline_style in first_time_styles:
        old_style = item.attrib.get('style', '')
        if not inline_style:
            continue
        new_style = merge_styles(old_style, inline_style, class_)
        item.attrib['style'] = new_style
        self._style_to_basic_html_attributes(item, new_style, force=True)

    if self.remove_classes:
        # now we can delete all 'class' attributes
        for item in page.xpath('//@class'):
            parent = item.getparent()
            del parent.attrib['class']

    ##
    ## URLs
    ##

    if self.base_url:
        for attr in ('href', 'src'):
            for item in page.xpath("//@%s" % attr):
                parent = item.getparent()
                if attr == 'href' and self.preserve_internal_links \
                        and parent.attrib[attr].startswith('#'):
                    continue
                if not self.base_url.endswith('/'):
                    self.base_url += '/'
                parent.attrib[attr] = urlparse.urljoin(
                    self.base_url, parent.attrib[attr].strip('/'))

    out = etree.tostring(root, method=self.method, pretty_print=pretty_print)
    if self.method == 'xml':
        out = _cdata_regex.sub(
            lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1), out)
    if self.strip_important:
        out = _importants.sub('', out)
    return out
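# Hedged driver sketch for transform(); "Premailer" as the class name is an
# assumption based on PremailerError above, and the html keyword is
# illustrative, so this stays commented out.
# p = Premailer(html='<style>h1 {color: red}</style><h1>Hi</h1>')
# print p.transform()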
def extract_user_node(self, block):
    """
    Step 1: Look for all css classes corresponding to possible user blocks
    Step 2: Find valid user nodes out of possible nodes (needs to contain link to user profile)
    Step 3: Once valid block identified, extract and return corresponding information

    :param block: lxml node corresponding to a block
    :return: a tuple containing:
        - the lxml node corresponding to the user block inside the input block
        - the corresponding user css class
        - the link to the user's profile
        - the text inside the user block (usually the user name)
    """
    block_copy = deepcopy(block)
    block_string = str(etree.tostring(block_copy))

    # Step 1
    css_classes_list = extract_css_class(block_string).split(' ')
    user_css_class = None
    user_css_classes = []
    for css_class in css_classes_list:
        if 'user' in css_class or 'author' in css_class:
            user_css_classes.append(css_class)

    if len(user_css_classes) > 0:
        # Step 2
        # There could be several possible user nodes, we only take the one
        # that satisfies extra criteria
        user_nodes = []
        for user_css_class in user_css_classes:
            user_nodes += list(CSSSelector('.{}'.format(user_css_class))(block_copy))
        valid_user_nodes = []  # list of tuples containing (user node, list of links in user node)
        for _user_node in user_nodes:
            links = _user_node.iterlinks()
            filtered_links = []
            # Filter links 1) on site 2) a tags 3) duplicates 4) text content
            # not null 5) contains no dates
            # Idea is when a user is mentioned, there is always a link to his profile
            for link in links:
                if link[0].tag == 'a':
                    if urlparse(link[2]).netloc == urlparse(self.url).netloc:
                        if link[1] not in [link[1] for link in filtered_links]:
                            if len(self.extract_text_content(link[0])) > 0:
                                if not self.contains_date(link[0]):
                                    filtered_links.append(link)
            if len(filtered_links) > 0:
                valid_user_nodes.append((_user_node, filtered_links))
                break

        # Step 3
        if len(valid_user_nodes) == 0:
            # No valid user nodes
            user_node = None
            user_link = None
            user_text = None
        else:
            user_node_pair = valid_user_nodes[0]  # Take the first node
            user_node = user_node_pair[0]
            filtered_links = user_node_pair[1]
            if len(filtered_links) > 1:
                self.logger.debug('multiple user links found')
            user_link = filtered_links[0][2]
            user_text = self.extract_text_content(filtered_links[0][0])
    else:
        # No candidate user css classes at all
        user_node = None
        user_link = None
        user_text = None
    return user_node, user_css_class, user_link, user_text
from concurrent import futures
import logging
from multiprocessing import cpu_count

from lxml import html
from lxml.cssselect import CSSSelector
from urlparse import urljoin

from .. import SITE_URL
from ..cache import cached_storage

TALKS_LIST_URL_FMT = "http://www.ted.com/talks/quick-list?page=%d"

_PAGINATION_INFO_SELECTOR = CSSSelector('div.pagination a:nth-last-of-type(1)')
_TALKS_URLS_SELECTOR = CSSSelector('div.quick-list__row div.title span a')

TALKS_URLS_BLACKLIST = [
    # No downloads
    'http://www.ted.com/talks/rokia_traore_sings_m_bifo.html',
    'http://www.ted.com/talks/rokia_traore_sings_kounandi.html',
    'http://www.ted.com/talks/andrew_stanton_the_clues_to_a_great_story.html',
]


def _parse_page(page_num):
    return html.parse(TALKS_LIST_URL_FMT % page_num)


def _get_num_pages():
    logging.debug('Trying to find out the number of talk list pages...')
    elements = _PAGINATION_INFO_SELECTOR(_parse_page(1))
    num_pages = int(elements[0].text_content())
# coding: utf-8
import lxml.html
from lxml.cssselect import CSSSelector
import requests

link = requests.get(
    'http://www.ieee.org/conferences_events/conferences/search/index.html')
html = lxml.html.fromstring(link.text)
study = CSSSelector('div.content-r-full table.nogrid-nopad tr p>a[href]')
lines = study(html)

n = 0
for line in lines:
    if n % 3 == 0:
        print "Conference name: ", line.text
    elif n % 3 == 1:
        print "Conference Date: ", line.text
    elif n % 3 == 2:
        print "Location: ", line.text
    print "============="
    n += 1
BASE_URL = 'https://london.hackspace.org.uk/'

cookiejar = cookielib.CookieJar()
processor = urllib2.HTTPCookieProcessor(cookiejar)
opener = urllib2.build_opener(processor)
urllib2.install_opener(opener)


def browse(url, params=None):
    if params is not None:
        params = urlencode(params)
    page = urllib2.urlopen(BASE_URL + url, params)
    return etree.HTML(page.read())


find_exception = CSSSelector('.alert-danger')

if len(sys.argv) > 1:
    print 'Checking for card... (scan card on the RFID reader attached to this computer)'
    uid = None
    while uid is None:
        try:
            with rfid.Pcsc.reader() as reader:
                for tag in reader.pn532.scan():
                    uid = tag.uid.upper()
                    break
        except rfid.NoCardException:
            pass
        time.sleep(0.1)
class DocumentParser(RISParser):
    """"""

    # an easy way to get these selectors is to use firefox and copy the
    # unique selector from the dev tools
    #
    # adoption_css = CSSSelector("#rismain table.risdeco tbody tr td table.tk1 tbody tr td table.tk1 tbody tr td table tbody tr.zl12 td.text3")
    # adoption_css = CSSSelector("table.risdeco tr td table.tk1 tr td.ko1 table.tk1 tr td table tr.zl12 td.text3")
    adoption_css = CSSSelector("tr.zl12:nth-child(3) > td:nth-child(5)")  # selects the td which holds status information such as "beschlossen"
    top_css = CSSSelector("tr.zl12:nth-child(3) > td:nth-child(7) > form:nth-child(1) > input:nth-child(1)")  # selects the td which holds the link to the TOP with transcript
    table_css = CSSSelector(".ko1 > table:nth-child(1)")  # table with info block
    attachments_css = CSSSelector("table.tk1:nth-child(23)")
    # main_css = CSSSelector("#rismain table.risdeco")

    MD5_FIELDS = ['docs', 'betreff', 'federführend']
    city = "Aachen"

    def __init__(self, url, tzinfo=timezone('Europe/Berlin'), months=12, **kwargs):
        self.utc = pytz.utc
        self.tzinfo = tzinfo
        self.consultation_list_start = False
        super(DocumentParser, self).__init__(url, **kwargs)
        # this will be moved to the second stage
        # self.db.documents.remove()

    @classmethod
    def construct_instance(cls, args):
        """construct the parser instance"""
        bu = args.base_url
        if not bu.endswith("/"):
            bu = bu + "/"
        url = bu + "vo020.asp?VOLFDNR=%s"
        return cls(url,
                   city=args.city,
                   mongodb_host=args.mongodb_host,
                   mongodb_port=args.mongodb_port,
                   mongodb_name=args.mongodb_name,
                   force=args.force)

    def before_save(self, data):
        """hook which is called with all the data for a document just before
        it's saved to the database. You have to return a data object yourself
        """
        return data

    def preprocess_text(self, text):
        """preprocess the incoming text, e.g. do some encoding etc."""
        return text

    def process(self, force=True):
        """process documents"""
        # get all the ids of the documents we need to parse
        agenda_items = self.db.agenda_items.find({
            "city": self.city,
        })
        print "processing %s agenda items" % agenda_items.count()
        document_ids = [item['volfdnr'] for item in agenda_items if "volfdnr" in item]
        print "processing %s documents" % len(document_ids)
        # self.process_document("11768", True)  # had wrong last_discussed
        # self.process_document("10745", True)  # street is "Ludwig Forum"
        # self.process_document("12811", True)  # street is "Hof" but shouldn't be
        # return
        # print document_ids
        for document_id in document_ids:
            self.process_document(document_id, force=self.force)
        return

    def process_document(self, document_id, force=False):
        """process a single document

        :param document_id: id of document to parse
        :param force: if True then reread the document regardless of whether
            we have it already in the db or not
        """
        print "trying document %s:%s" % (self.city, document_id)
        found = False
        try:
            data = self.db.documents.find_one({
                '_id': "%s:%s" % (self.city, document_id),
                'document_id': str(document_id),
                'city': self.city,
            })
            found = True
        except Exception, e:
            print "problem when trying to find document id %s: %s" % (document_id, e)
            # we did not find any old data, so lets create an empty one
            found = False
        if data is None:
            data = {
                '_id': "%s:%s" % (self.city, document_id),
                'document_id': document_id,
                'document_url': self.url % document_id,
                'last_discussed': TIME_MARKER,  # date of last appearance in a meeting
                'created': datetime.datetime.now(),  # for our own reference
            }
            found = False
        if found and not force:
            print "%s already read" % document_id
            return

        url = self.url % document_id
        print "reading", url
        self.response = response = requests.get(url)
        if "noauth" in response.url:
            print "*** no permission to read %s" % url
            print
            return
        text = self.preprocess_text(response.text)
        doc = html.fromstring(text)

        # Check info block
        try:
            table = self.table_css(doc)[0]  # lets hope we always have this table
        except:
            # for some reason on some runs this can't be found but on the
            # next one it can, so we are saving the page for now.
            print "**** INFO TABLE NOT FOUND, ABORTING document processing"
            fn = "/tmp/pyallris-error-%s-%s.html" % (self.city, document_id)
            fp = codecs.open(fn, "w", "utf-8")
            fp.write(text)
            fp.close()
            return

        self.consultation_list_start = False
        for line in table:
            headline = line[0].text
            if headline:
                headline = headline.split(":")[0].lower()
                if headline[-1] == ":":
                    headline = headline[:-1]
                if headline == "betreff":
                    e = etree.tostring(line[1], encoding="utf-8")
                    e = unicode(e, "utf-8")  # as etree does not return unicode
                    value = html2text.html2text(e)
                    data[headline] = value
                elif headline in ['status', 'verfasser', u'federführend']:
                    data[headline] = line[1].text.strip()
                elif headline == "beratungsfolge":
                    # the actual list will be in the next row inside a table,
                    # so we only set a marker
                    data = self.parse_consultation_list_headline(line, data)  # for parsers which have the consultation list here
            elif self.consultation_list_start:
                data = self.parse_consultation_list(line, data)  # for parsers which have the consultation list in the next tr
                self.consultation_list_start = False  # set the marker to False again as we have read it
            # we simply ignore the rest (there might not be much more actually)

        # the actual text comes after the table in a div, but it's not valid
        # XML or HTML, thus using regex
        docs = body_re.findall(self.response.text)
        data['docs'] = docs
        data = utils.update_md5(data, self.MD5_FIELDS)
        data['city'] = self.city

        plaintext = data.get("betreff", "").lower()
        md = ""
        for d in data.get("docs"):
            plaintext = plaintext + " " + html2text.html2text(d.lower())
            md = md + "\n\n\n--------------------------------------------------------------------------------\n\n\n" + html2text.html2text(d)
        data['markdown'] = md

        streets = {}  # this stores official street name => street._id
        geolocations = []
        geolocation = None
        for street in self.streets.keys():
            if re.search(r"\b" + re.escape(street) + r"\b", plaintext):
                s = self.streets[street]
                streets[s['original']] = s['_id']
                if "lat" in s:
                    sname = s['original'].replace(".", ":")  # we have to replace dots for mongodb keys. So we use a :
                    loc = {
                        'name': s['original'],
                        'lat': s["lat"],
                        'lon': s["lng"]
                    }
                    geolocations.append(loc)
                    # we now store the location of the first street in our
                    # database for the geo index
                    if geolocation is None:
                        geolocation = {'lat': s["lat"], 'lon': s["lng"]}
        # data['streets'] = streets
        data['geolocations'] = geolocations
        data['geolocation'] = geolocation
        data = self.before_save(data)
        # pprint.pprint(data)
        self.db.documents.save(data)
        time.sleep(1)
        return

        # we do attachments later, for now we save that stuff without them
        # get the attachments if possible
        attachments = self.attachments_css(doc)
        if len(attachments) > 0 and attachments[0][1][0].text.strip() == "Anlagen:":
            for tr in attachments[0][3:]:
                nummer = tr[1].text
                link = tr[2][0]
                href = link.attrib["href"]
                name = link.text
                # TODO: save it
        return
def Parse(reading):
    result = {"url": reading["url"]}
    text = reading["text"]
    text = re.sub("<p<", "", text)  # this error is too severe for the parser to handle
    doc = lxml.html.parse(StringIO(text))
    root = doc.getroot()
    # body = h.find(".//body")
    maindiv = CSSSelector("#divMiddleLeftCentreBottomRight")(root)[0]
    heading = CSSSelector("#divHeading h1")(maindiv)[0].text
    intro = CSSSelector("#divIntroduction h2")(maindiv)[0]
    h2 = lxml.etree.tounicode(intro)
    # print [heading, h2]

    mheading = re.match(u"([\w\s\-']*?)\s*(?:\u2013\s*(?:PPC for (.*?)$)?|$)", heading)
    result["name"] = mheading.group(1)

    mmpfor = re.search(u'(?:<br\s*/>)?\s*MP for (.*?)\s*<br\s*/>', h2)
    if mmpfor:
        result["MP for"] = mmpfor.group(1)
        result["MP for"] = result["MP for"]  # needs to be regularized for the 2005 boundaries

    mcandidate = re.search(
        u'Liberal Democrat candidate for <a href="in_your_area_detail.aspx.*?">(.*?)</a>',
        h2)
    if mcandidate:
        result["constituency"] = RegularizeConstituency(mcandidate.group(1))
    elif mheading.group(2):
        result["constituency"] = RegularizeConstituency(mheading.group(2))
    elif "MP for" in result:
        result["constituency"] = RegularizeConstituency(result["MP for"])
    else:
        assert False, (h2, heading)

    divImage = maindiv.cssselect("#divIntroduction a img")
    if divImage:
        result["image"] = divImage[0].get("src")

    # print maindiv.cssselect("#divAboutMe h2")[0].text, "About Me"
    for traboutme in maindiv.cssselect("#divAboutMe tr"):
        key = traboutme.cssselect("th")[0].text[:-1]
        assert key in ["Marital Status", "Occupation", "Education"]
        value = traboutme.cssselect("td")[0].text
        if value:
            value = re.sub(u"\u2019", "'", value).strip()
            value = re.sub(u"\u2013", "-", value)
            value = re.sub("\xae", "", value)
            value = re.sub("\s*\n\s*", "; ", value)
            result[key] = value

    divBiography = maindiv.cssselect("#divBiography")
    if divBiography:
        result["bio"] = SimplifyHTML(divBiography[0])
        result["bio"] = re.sub("^Biography\s+", "", result["bio"])  # clean out leading title

    contacttext = lxml.etree.tounicode(
        maindiv.cssselect("#divIndividualContactInfo")[0])
    memail = re.search('<strong>Email:</strong> <a href="(?:mailto:)?(.*?)">',
                       contacttext)
    if memail:
        result["email"] = memail.group(1)
    mwebsite = re.search('<strong>Website:</strong> <a href="(.*?)">', contacttext)
    if mwebsite:
        result["website"] = mwebsite.group(1)
    mphone = re.search('<strong>Telephone:</strong> ([\d\s]+)', contacttext)
    if mphone:
        result["phone"] = mphone.group(1).strip()

    address = "; ".join([
        addressline.text
        for addressline in maindiv.cssselect("#divIndividualContactInfo ul li")
    ])
    if address:
        result["address"] = address.encode("ascii", "replace")  # the database doesn't seem to be unicode. it should be
    return result
def getView(document, css, media='all', name=None,
            styleCallback=lambda element: None):
    """
    document
        a DOM document, currently an lxml HTML document
    css
        a CSS StyleSheet string
    media: optional
        TODO: view for which media it should be
    name: optional
        TODO: names of sheets only
    styleCallback: optional
        should return css.CSSStyleDeclaration of inline styles, for html
        a style declaration for ``element@style``. Gets one parameter
        ``element`` which is the relevant DOMElement

    returns style view
        a dict of {DOMElement: css.CSSStyleDeclaration} for html
    """
    sheet = cssutils.parseString(css)
    view = {}
    specificities = {}  # needed temporarily

    # TODO: filter rules simpler?, add @media
    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in rules:
        for selector in rule.selectorList:
            log(0, 'SELECTOR', selector.selectorText)
            # TODO: make this a callback to be able to use other stuff than lxml
            cssselector = CSSSelector(selector.selectorText)
            matching = cssselector.evaluate(document)
            for element in matching:
                # if element.tag in ('div',):
                # add styles for all matching DOM elements
                log(1, 'ELEMENT', id(element), element.text)
                if element not in view:
                    # add initial empty style declaration
                    view[element] = cssutils.css.CSSStyleDeclaration()
                    specificities[element] = {}
                    # and add inline @style if present
                    inlinestyle = styleCallback(element)
                    if inlinestyle:
                        for p in inlinestyle:
                            # set inline style specificity
                            view[element].setProperty(p)
                            specificities[element][p.name] = (1, 0, 0, 0)

                for p in rule.style:
                    # update style declaration
                    if p not in view[element]:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        view[element].setProperty(p.name, p.value, p.priority)
                        specificities[element][p.name] = selector.specificity
                        log(2, view[element].getProperty('color'))
                    else:
                        log(2, view[element].getProperty('color'))
                        sameprio = (p.priority ==
                                    view[element].getPropertyPriority(p.name))
                        if not sameprio and bool(p.priority) or (
                                sameprio and selector.specificity >=
                                specificities[element][p.name]):
                            # later, more specific or higher prio
                            view[element].setProperty(p.name, p.value, p.priority)

    # pprint(view)
    return view
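# Minimal sketch of getView in action, assuming cssutils is importable and an
# lxml version where the deprecated XPath.evaluate() used above still exists;
# the markup and stylesheet are illustrative.
import lxml.html

_doc = lxml.html.fromstring('<div><p class="hl">text</p><p>plain</p></div>')
_view = getView(_doc, '.hl { color: red }')
for _el, _decl in _view.items():
    print(_el.tag, _decl.getPropertyValue('color'))  # expected: p red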
def transform(self, pretty_print=True, **kwargs):
    """change the self.html and return it with CSS turned into style
    attributes.
    """
    if etree is None:
        return self.html

    if self.method == 'xml':
        parser = etree.XMLParser(ns_clean=False, resolve_entities=False)
    else:
        parser = etree.HTMLParser()
    stripped = self.html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    root = tree if stripped.startswith(tree.docinfo.doctype) else page

    if page is None:
        print repr(self.html)
        raise ValueError("Could not parse the html")
    assert page is not None

    ## style tags
    for element in CSSSelector('style,link[rel~=stylesheet]')(page):
        # If we have a media attribute whose value is anything other than
        # 'screen', ignore the ruleset.
        media = element.attrib.get('media')
        if media and media != 'screen':
            continue

        is_style = element.tag == 'style'
        if is_style:
            css_body = element.text
        else:
            href = element.attrib.get('href')
            if not href:
                continue
            css_body = self._load_external(href)
        self._parse_style_rules(css_body)

        parent_of_element = element.getparent()
        if not self.keep_style_tags or not is_style:
            parent_of_element.remove(element)

    ## explicitly defined external style file
    if self.external_styles:
        for stylefile in self.external_styles:
            css_body = self._load_external(stylefile)
            self._parse_style_rules(css_body)

    for tag_classes in page.xpath('//@class'):
        tag = tag_classes.getparent()
        tag_classes = [
            '.' + c.strip() for c in tag_classes.split(' ') if c.strip()
        ]
        for tag_class in tag_classes:
            if tag_class in self.rules:
                old_style = tag.attrib.get('style', '')
                new_style = self.rules[tag_class]
                if old_style:
                    new_style = '; '.join([old_style, new_style])
                tag.attrib['style'] = new_style

    if self.remove_classes:
        # now we can delete all 'class' attributes
        for item in page.xpath('//@class'):
            parent = item.getparent()
            del parent.attrib['class']

    kwargs.setdefault('method', self.method)
    kwargs.setdefault('pretty_print', pretty_print)
    out = etree.tostring(root, **kwargs)
    if self.method == 'xml':
        out = _cdata_regex.sub(
            lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1), out)
    if self.strip_important:
        out = _importants.sub('', out)
    return out
_htmlTree = etree.HTML(_html)
result = etree.tostring(_htmlTree, pretty_print=True, method="html")
print len(result)

nodes = _htmlTree.xpath('//*[@href]')
print len(nodes)
for i, node in enumerate(nodes):
    if i < 20:
        print i, node.attrib

import lxml.html
from lxml.cssselect import CSSSelector
import requests

r = requests.get('http://python.org/')
html = lxml.html.fromstring(r.text)
sel = CSSSelector('a[href]')
# Apply the selector to the DOM tree.
nodes = sel(html)
print len(nodes)
for i, node in enumerate(nodes):
    # print lxml.html.tostring(node)
    if i < 20:
        print i, node.get('href'), node.text
import requests
import re
import json
from lxml import html, etree
from lxml.cssselect import CSSSelector

verbose = True

SITE_URL = 'http://www.filmweb.pl'

select_topics_links = CSSSelector('.topics-list h3 a')
select_first_post = CSSSelector('.firstPost')
select_post_author = CSSSelector('.userName')
select_date_time = CSSSelector('.cap')
select_points = CSSSelector('.plusCount')
select_post_info = CSSSelector('.postInfo')
select_post_text = CSSSelector('.text')
select_title = CSSSelector('h1 a')


def get_opinion(opinion_url):
    response = requests.get(opinion_url)
    tree = html.fromstring(response.content)
    first_post = select_first_post(tree)[0]
    rating_match = re.search(rb'(\d+) <i',
                             etree.tostring(select_post_info(first_post)[0]))
    post_text_el = select_post_text(first_post)[0]
    etree.strip_elements(post_text_el, "*", with_tail=False)
    opinion = {
        'author': select_post_author(first_post)[0].text.strip(),
        'date': select_date_time(first_post)[0].get('title'),
        'rating': int(rating_match.group(1)) if rating_match else None,
def sell(self, url):
    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd = {}
        return
    tree = etree.HTML(response)
    soup = BeautifulSoup(response)
    self.fd['house_flag'] = 1
    self.fd['belong'] = 0
    detail_mer = soup.find('div', {'class': 'detail_mer'})
    # not a private listing ("个人房源"): return
    if u"个人房源" not in str(detail_mer):
        return
    Dname = detail_mer.find('span', {'class': 'Dname'})
    if Dname:
        self.fd['owner_name'] = Dname.string
    else:
        self.fd['owner_name'] = None
    ganji_phone_call_class = detail_mer.find('span', {'class': 'ganji_phone_call_class'})
    if ganji_phone_call_class:
        self.fd['owner_phone'] = ganji_phone_call_class.contents[0]
        if str(ganji_phone_call_class).find('src='):
            self.fd['owner_phone'] = 'http://' + urlparse(url)[1] + ganji_phone_call_class.img['src']
        else:
            self.fd['owner_phone'] = None
    else:
        self.fd['owner_phone'] = None
    # no contact info: return
    if not self.fd['owner_phone']:
        return
    if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
        cityname = re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
        self.fd['cityname'] = cityname
    else:
        return
    if re.search(self.house_floor_regex, response):
        house_floor = re.search(self.house_floor_regex, response).group(1)
        house_topfloor = re.search(self.house_floor_regex, response).group(2)
        self.fd['house_floor'] = house_floor
        self.fd['house_topfloor'] = house_topfloor
    else:
        self.fd['house_floor'] = None
        self.fd['house_topfloor'] = None
    if re.search(self.house_totalarea_regex, response):
        house_totalarea = re.search(self.house_totalarea_regex, response).group(1)
        self.fd['house_totalarea'] = house_totalarea
    else:
        self.fd['house_totalarea'] = None
    # type
    if re.search(self.house_type_regex, response):
        house_type = re.search(self.house_type_regex, response).group(1)
        self.fd['house_type'] = housetype(house_type)
    else:
        self.fd['house_type'] = None
    if re.search(self.house_price_regex, response):
        house_price = re.search(self.house_price_regex, response).group(1)
        if house_price == "面议":  # "negotiable"
            house_price = "0"
        self.fd['house_price'] = house_price
    else:
        self.fd['house_price'] = None
    posttime = CSSSelector('span.pub_time')(tree) != None and \
        CSSSelector('span.pub_time')(tree)[0].text.strip() or None
    if posttime:
        Y = int(time.strftime('%Y', time.localtime()))
        M = int(posttime.split(' ')[0].split('-')[0])
        D = int(posttime.split(' ')[0].split('-')[1])
        s = datetime.datetime(Y, M, D, 0, 0)
        posttime = int(time.mktime(s.timetuple()))
        self.fd['posttime'] = posttime
    else:
        self.fd['posttime'] = None
    if re.search(self.house_room_regex, response):
        house_room = re.search(self.house_room_regex, response).group(1)
        self.fd['house_room'] = house_room
    else:
        self.fd['house_room'] = '0'
    if re.search(self.house_hall_regex, response):
        house_hall = re.search(self.house_hall_regex, response).group(1)
        self.fd['house_hall'] = house_hall
    else:
        self.fd['house_hall'] = '0'
    if re.search(self.house_toilet_regex, response):
        house_toilet = re.search(self.house_toilet_regex, response).group(1)
        self.fd['house_toilet'] = house_toilet
    else:
        self.fd['house_toilet'] = '0'
    house_title = CSSSelector("div.detail_title h1")(tree)[0] != None and \
        CSSSelector("div.detail_title h1")(tree)[0].text.strip() or None
    self.fd['house_title'] = house_title.replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
    # description
    detail_box = soup.find('div', {'class': 'detail_box'})
    if detail_box:
        house_desc = str(detail_box('p')[1])
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的", "", house_desc)
    else:
        self.fd['house_desc'] = None
    d_i = soup.find('ul', {'class': 'd_i'})
    # residential complex name
    # handle the JS-embedded case first
    if re.search(self.xiaoqu_regex, response):
        borough_name = re.search(self.xiaoqu_regex, response).group(1)
        self.fd['borough_name'] = borough_name
        if re.search(self.address_regex, response):
            house_addr = re.search(self.address_regex, response).group(1)
            self.fd['house_addr'] = house_addr
    else:
        if d_i.find(text="小区: "):
            borough_box = d_i.find(text="小区: ").parent
            borough_name = borough_box.find("a")
            if borough_name:
                self.fd['borough_name'] = borough_name.string
            else:
                self.fd['borough_name'] = None
            # address
            if borough_name and borough_name.nextSibling:
                house_addr = borough_name.nextSibling.string
                self.fd['house_addr'] = re.sub("\(|\)| ", "", house_addr)
            else:
                self.fd['house_addr'] = None
        else:
            if re.search(self.borough_name_regex, response):
                borough_name = re.search(self.borough_name_regex, response).group(1)
                self.fd['borough_name'] = re.sub("\(.*\)| ", "", borough_name)
    # district
    area_box = d_i.find(text="区域: ").parent
    area_a = area_box('a')
    if area_a and len(area_a) > 1:
        self.fd['cityarea'] = area_a[0].string
        self.fd['section'] = area_a[1].string
    elif area_a and len(area_a) == 1:
        self.fd['cityarea'] = area_a[0].string
        self.fd['section'] = None
    else:
        self.fd['cityarea'] = None
        self.fd['section'] = None
    if re.search(self.house_age_regex, response):
        house_age = re.search(self.house_age_regex, response).group(1)
        self.fd['house_age'] = house_age
    else:
        self.fd['house_age'] = None
    # orientation
    if re.search(self.house_toward_regex, response):
        house_toward = re.search(self.house_toward_regex, response).group(1)
        self.fd['house_toward'] = toward(house_toward)
    else:
        self.fd['house_toward'] = None
    if re.search(self.house_fitment_regex, response):
        house_fitment = re.search(self.house_fitment_regex, response).group(1)
        self.fd['house_fitment'] = fitment(house_fitment)
    else:
        self.fd['house_fitment'] = 2
    request = None
    response = None
    soup = None
    tree = None
    del tree
    del request
    del response
    del soup