def extract_comments(html):
    """Yield one dict per comment found in the rendered comment HTML.

    Each dict has keys: 'cid' (the data-cid attribute), 'text', 'time',
    'author', and 'tag' (sentiment label from get_comment_sentiment).
    """
    tree = lxml.html.fromstring(html)
    item_sel = cs('.comment-item')
    text_sel = cs('.comment-text-content')
    time_sel = cs('.time')
    author_sel = cs('.user-name')
    for item in item_sel(tree):
        # Run the text selector and text_content() once; the original
        # evaluated them twice per item (for 'text' and for 'tag').
        comment_text = text_sel(item)[0].text_content()
        yield {
            'cid': item.get('data-cid'),
            'text': comment_text,
            'time': time_sel(item)[0].text_content().strip(),
            'author': author_sel(item)[0].text_content(),
            'tag': get_comment_sentiment(comment_text),
        }
def get_scores(text):
    """Parse the nested score table in `text` into a dict keyed by group.

    Each group maps to {"updated": <timestamp row>, "scores": [rows]},
    where a score row has "time", "home", "away" and "score" fields.
    Rows with other than 4 cells start a new group.
    """
    doc = document_fromstring(text)
    selector = cs('table table table[bgcolor="#666666"] tr')
    rows = [row for row in selector(doc) if row.text_content()]
    result = {}
    group = None
    for row in rows:
        if group is None:
            # The very first non-empty row names the first group.
            group = strip(row.text_content().strip())
            result[group] = {}
            continue
        if not result[group]:
            # First row after a group header carries the "updated" stamp.
            result[group]["updated"] = strip(row.text_content())
        elif len(row) == 4:
            # A 4-cell row is one match: time, home, score, away.
            match = {
                "time": strip(row[0].text_content()),
                "home": strip(row[1].text_content()),
                "away": strip(row[3].text_content()),
                "score": strip(row[2].text_content()),
            }
            result[group].setdefault("scores", []).append(match)
        else:
            # Anything else starts a new group.
            group = strip(row.text_content())
            result[group] = {}
    return result
def text(selector, html):
    """Return the stripped text content of elements matching `selector`.

    Returns "" when nothing matches, the single element's stripped text
    when exactly one matches, and the concatenation of each element's
    stripped text otherwise.
    """
    res = cs(selector)(html)
    if not res:
        return ""
    if len(res) == 1:
        # `res` is already known truthy here; the original re-checked it.
        return res[0].text_content().strip()
    return "".join(el.text_content().strip() for el in res)
def extract_title(fragment):
    """Return the text of the first <h1> in `fragment`, or "" if absent.

    Returns "" immediately when lxml is unavailable (has_lxml is a
    module-level flag set elsewhere in this file).
    """
    if not has_lxml:
        return ""
    doc = document_fromstring(fragment)
    headings = cs('h1')(doc)
    if not headings:
        # No <h1> present. The original used a bare `except:` with a
        # traceback print here, which also swallowed unrelated errors.
        return ""
    return headings[0].text_content()
def summarize(content, url=""): """Return a summary for an html document. If a URL is passed, it may be treated specially to give better results, eg. twitter will return the tweet.""" html = document_fromstring(content) if url: parsed = urlparse.urlparse(url) if parsed.netloc.endswith("twitter.com") and "status" in url: tweet = text(".permalink-tweet .tweet-text", html) try: username = cs(".permalink-tweet")(html)[0].attrib["data-screen-name"] return "@%s: %s" % (username, tweet) except: return tweet # try to return opengraph description or title first, then just the <title> ogdesc = first("meta[property=\"og:description\"]", html) if ogdesc: return utils.maxlen(ogdesc.attrib["content"]) ogtitle = first("meta[property=\"og:title\"]", html) if ogtitle: return utils.maxlen(ogtitle.attrib["content"]) return text("title", html)
def extract_reply_cids(html):
    """Return the data-cid attribute of every 'load more replies' link."""
    tree = lxml.html.fromstring(html)
    links = cs('.comment-replies-header > .load-comments')(tree)
    return [link.get('data-cid') for link in links]
from django.core.paginator import Paginator, InvalidPage, EmptyPage from django.db.models import Q from django.http import Http404, HttpResponseRedirect, HttpResponse from django.shortcuts import render_to_response from django.template import RequestContext from laws import models from laws.models import SectionFile, SearchForm from utils.searchtext import searchtext_sphinx, searchtext_FTS4 from utils.utils import * from operator import itemgetter from lxml import html, etree from lxml.cssselect import CSSSelector as cs import settings bodysel = cs('body') def target_remove(request): current_url = request.get_full_path() new_url = current_url.replace('target/','') return HttpResponseRedirect(new_url) @render_to("code_display.html") def target_to_section(request, codename, target_section): if codename == 'this': current_url = request.META['HTTP_REFERER'] codename = current_url.split('-')[1] #For Table of Contents, there is a trailing / that needs to be removed codename = codename.strip('/') print request.get_full_path() #Hack to ensure there is one, and only one './' at the end of the url
# Python
# run popCode() then main()
import os, sys, re, subprocess, fnmatch
import pickle
from subprocess import PIPE, Popen
from lxml import html, etree
from lxml.cssselect import CSSSelector as cs
from laws import models

# Settings
regfile = "./utils/sectionlist.txt"
# Folder that holds the legislation files
pathin = "./media/cacodegit/"

divisions = cs("div")


def unix_find(pathin):
    """Return results similar to the Unix find command run without options
    i.e. traverse a directory tree and return all the file paths
    """
    for root, dirs, files in os.walk(pathin):
        for filename in files:
            # Skip dotfiles (names matching ".*").
            if not fnmatch.fnmatch(filename, ".*"):
                yield os.path.join(root, filename)


# Populate the Code table.
# NOTE(review): pickle.load assumes the dict file is trusted local data.
codedictpath = "./utils/codedict"
# Use a context manager so the handle is closed deterministically;
# the original `pickle.load(open(...))` leaked the file object.
with open(codedictpath, "rb") as _codedict_file:
    codedict = pickle.load(_codedict_file)
codedictpath = '/Users/tabulaw/Documents/workspace/calaw/codedict' codedict = pickle.load(open(codedictpath,'rb')) def popCode(): for codeabbr in codedict: saveCode(codeabbr) def saveCode(codeabbr): code_current = models.Code( name = codeabbr, fullname = codedict[codeabbr], url = '/laws/target/'+codeabbr+'/' ) code_current.save() # Parse sections and save each to the db divsel = cs('div') def getSectionsHTML(inputfiletext): #a = open(inputfile) # Grabs the whole page #inputfiletext = read(a) #a.close() #tree = html.parse(inputfiletext) tree = html.document_fromstring(inputfiletext) sections = divsel(tree) # creates a list of the div elements of the document #creates a list of tuples for each section: section number and html content sections_html = [(section.get("id"), etree.tostring(section)) for section in sections] return sections_html def saveSectionFile(codeinput, filename, inputfiletext): code_instance = models.Code.objects.get(name = codeinput) sectionfile_current = models.SectionFile(
def first(selector, html):
    """Return the first element matching `selector` in `html`, or None."""
    res = cs(selector)(html)
    if not res:
        # One truthiness test suffices; the original's
        # `not res or not len(res)` checked the same condition twice.
        return None
    return res[0]