# Core summarization function: build an LSA model over the sentences, score each sentence
# with the SVD, and return the k highest-scoring sentences joined with '.'.
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        # Collect the plain text of all <p> elements on the page.
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        # Keep only plain words and tokens containing '.', "'" or '"'.
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence) > 1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    # Pair each sentence with the 2-norm of the singular-value-scaled columns of Vt.
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    # Deduplicate by sentence (keeping the highest-scoring entries) and return the top k.
    summary = dict((v[0], v) for v in sorted(summary, key=lambda s: s[1])).values()
    return '.'.join([a for a, b in summary][len(summary) - k:])
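# A minimal usage sketch for summarize() -- the URL and text below are hypothetical, and the
# snippet assumes the surrounding module defines LSA, stopwords, ignore_characters and the
# re / numpy / pattern.web imports used above.
print summarize(url="http://en.wikipedia.org/wiki/Latent_semantic_analysis", k=4)
print summarize(query="LSA builds a term-by-sentence matrix. "
                      "The matrix is factored with SVD. "
                      "Sentences with the largest singular components are kept. "
                      "Shorter summaries keep fewer of them", k=2)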
def main():
    table = Datasheet()
    tel = ''
    street = ''
    locality = ''
    title = ''
    # Scrape the first three result pages of veterinarians in Torino from paginegialle.it.
    for i in range(3):
        page = i + 1
        url = URL("http://torino.paginegialle.it/pgol/4-veterinari/3-torino/p-%s?mr=50" % page)
        print "collecting from %s" % url
        connection = url.open()
        doc = Document(connection.read())
        items = doc.by_class('item_sx')
        row = []
        for j, item in enumerate(items):
            divs = item.by_class('address')
            try:
                title = item.by_class('item_head')[0].by_tag('a')[0].content
            except IndexError, e:
                print >> sys.stderr, "%s" % j, e
            for z, div in enumerate(divs):
                if div != None:
                    try:
                        street = div.by_class('street-address')[0].content
                        locality = div.by_class('locality')[0].content
                        tel = div.by_class('tel')[0].by_class('value')[0].content
                    except IndexError, e:
                        print >> sys.stderr, "%s" % z, e
                    # One CSV-like line per entry: name, street, locality, phone.
                    save = "%s, %s %s, %s \n" % (
                        plaintext(title),
                        plaintext(street).replace(",", ""),
                        plaintext(locality).replace('(TO)', ''),
                        plaintext(tel).replace(",", "")
                    )
                    print >> sys.stderr, save
                    row.append(save)
def summarize_evaluation(query=None, url=None, summary=None):
    j = []
    if url:
        # Collect the plain text of all <p> elements on the page.
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence) > 1 and sentence != '']
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    # Build a second LSA model from the candidate summary.
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
    # Compare the two models through the angles between their singular-value-weighted vectors.
    vectors = [(dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :])) for i in range(len(lsa.U))]
    vectors2 = [(dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :])) for i in range(len(lsa2.U))]
    angles = [arccos(dot(a, b) / (norm(a, 2) * norm(b, 2))) for a in vectors for b in vectors2]
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
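# A minimal usage sketch for summarize_evaluation() -- the data is hypothetical. In the
# non-URL branch `query` and `summary` are iterated sentence by sentence, so lists of
# sentences (not plain strings) are assumed here.
original = ["LSA builds a term-by-sentence matrix",
            "The matrix is factored with SVD",
            "Sentences with the largest singular components are kept"]
candidate = ["Sentences with the largest singular components are kept"]
print summarize_evaluation(query=original, summary=candidate)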
def heuristic_scrape(article):
    from pattern.web import URL, Document, HTTP404NotFound, URLError, plaintext
    try:
        s_content = URL(article).download()
    except (URLError, HTTP404NotFound):
        print "Error downloading article"
        return ("could not download", article)
    dom = Document(s_content)
    text = ''
    # Concatenate the text nodes of every <p> element in the page.
    for node in dom.by_tag('p'):
        for c in node:
            if c.type == 'text':
                text = text + ' ' + plaintext(c.source())
    return text.strip()
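# A minimal usage sketch for heuristic_scrape() -- the URL below is only an example.
text = heuristic_scrape("http://www.bbc.co.uk/news/world-europe-17839672")
if isinstance(text, basestring):  # a tuple is returned when the download fails
    print text[:300]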
def main():
    table = Datasheet()
    # One result page per postal code (CAP) of Torino.
    for cap in CAPS:
        url = URL("http://www.comuni-italiani.it/001/272/farmacie/cap%s.html" % cap)
        connection = url.open()
        doc = Document(connection.read())
        items = doc.by_tag("table")
        row = []
        for j, td in enumerate(items[5].by_tag('td')):
            strcap = "%s, Telefono:" % cap
            save = "%s" % (plaintext(td.content)
                           .replace('\n', ',', 3)
                           .replace("Telefono:", strcap)
                           .replace(";", "")
                           .replace("Partita Iva", ",Partita Iva")) + "\n"
            if save != None:
                row.append(save)
        table.append(row)
        print "%s ----------------------------------------------------------------------------" % str(j)
    table.save("files/farmacie_torino.txt")
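# The scraper scripts above are presumably run as standalone modules; a conventional entry
# point would look like this (an assumption -- the original files may simply call main()):
if __name__ == "__main__":
    main()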
def main(): table = Datasheet() url = URL("http://www.comuniecitta.it/torino/elenco-ospedali-di-torino.html") connection = url.open() doc = Document( connection.read() ) items = doc.by_class('ulamm')[1:] row = [] for ul in items: li = ul.by_tag('li') kind = plaintext(ul.previous.content) for el in li: if el != None: save = "%s, %s \n" % ( plaintext(el.content).replace('\n', ','), kind, ) row.append(save) table.append( row ) table.save("files/h_torino.txt")
def get_dom(url):
    try:
        s_content = URL(url).download(timeout=120, cached=False)
    except (URLError, HTTP404NotFound):
        print "Error downloading article"
        return None
    # for AJE compatibility
    try:
        s_content = s_content.decode('unicode_escape')
    except UnicodeEncodeError:
        pass
    return Document(s_content)
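# A minimal usage sketch for get_dom() -- the URL is hypothetical, and plaintext is assumed
# to be imported from pattern.web as in the other snippets in this module.
dom = get_dom("http://www.aljazeera.com/news/")
if dom is not None:
    for p in dom.by_tag("p")[:3]:
        print plaintext(p.content)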
for p in ("-", "-web", "-db", "-search", "-vector", "-graph", "-canvas", "-metrics", "-de", "-en", "-es", "-fr", "-it", "-nl", "-shell", "stop-words", "mbsp-tags", "-dev"): # We include some useful pages (Penn Treebank tags, stop words) referenced in the documentation. if p.startswith("-"): p = "pattern" + p.rstrip("-") title = p.replace("-", ".") if p == "stop-words": title = "Stop words" if p == "mbsp-tags": title = "Penn Treebank II tag set" # Download the online documentation pages. print "Retrieving", url + p html = URL(url + p).download(cached=False) # Parse the actual documentation, we don't need the website header, footer, navigation, search. html = Document(html) html = html.by_id("content-area") html = html.by_class("node-type-page")[0] html = html.source html = strip_javascript(html) html = strip_between('<div id="navbar">', '/#navbar -->', html) html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html) html = strip_between('<div id="footer">', '/#footer -->', html) html = strip_between('<a class="twitter-share-button"', '</a>', html) # Link to local pages and images. # Link to online media. html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url) # MBSP docs (online) html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(using-.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(modeling-.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2', html) # pages (offline)
import os, sys
sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The Document object can be used for this, similar to the Javascript DOM.

# For example:
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
for e in dom.get_elements_by_tagname("div.entry")[:5]:   # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:   # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print

# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in Document(url.download()).by_tag("a"):
    link = link.attributes.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    #print link
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The Document object can be used for this, similar to the Javascript DOM.

# For example:
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
print dom.body.content.__class__
for e in dom.get_elements_by_tagname("div.entry")[:5]:   # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:   # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print

# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in Document(url.download()).by_tag("a"):
    link = link.attributes.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    #print link

# The Document object is a tree of Element and Text objects.
    angles = [arccos(dot(a, b) / (norm(a, 2) * norm(b, 2))) for a, b in vectors[1:]]
    return str(abs(1 - float(angles[0]) / float(pi / 2)))

def graph(query1, query2):
    lsa = LSA(stopwords, ignore_characters)
    titles = [lsa.search_wiki(query1), lsa.search_wiki(query2)]
    for t in titles:
        lsa.parse(t)
    lsa.build()
    lsa.calc()
    lsa.plotSVD()

## core summarization function.
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence) > 1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()