Example #1
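# Assumed module-level imports in the original lsa.py (not shown in this snippet):
# URL, Document and plaintext from pattern.web; dot, diag, arccos and pi from numpy
# with norm from numpy.linalg; itemgetter from operator; re; and the project's LSA
# class together with its stopwords and ignore_characters lists.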
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [
            word for sentence in j for word in sentence.split()
            if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word
            or '"' in word
        ]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [
            sentence for sentence in sentences
            if len(sentence) > 1 and sentence != ''
        ]
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
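    # Build the term-by-sentence matrix and compute its SVD (fills lsa1.S and lsa1.Vt).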
    lsa1.build()
    lsa1.calc()
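    # Score each sentence against the singular-value-weighted columns of Vt,
    # de-duplicate, and keep the k highest-scoring sentences as the summary.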
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    sorted(summary, key=itemgetter(1))
    summary = dict(
        (v[0], v)
        for v in sorted(summary, key=lambda summary: summary[1])).values()
    return '.'.join([a for a, b in summary][len(summary) - (k):])
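# Illustrative call (the URL here is only an example, not part of the original project):
#print summarize(url="http://en.wikipedia.org/wiki/Automatic_summarization", k=4)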
Example #2
File: lsa.py Project: pegasos1/pyLSA
def summarize(query=None, k=4,url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    summary =[(sentences[i], norm(dot(diag(lsa1.S),lsa1.Vt[:,b]),2)) for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    sorted(summary, key=itemgetter(1))
    summary = dict((v[0],v) for v in sorted(summary, key=lambda summary: summary[1])).values()
    return '.'.join([a for a, b in summary][len(summary)-(k):])
Example #3
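# Scrape the name, address and phone number of veterinarians in Turin from the
# first three result pages of paginegialle.it.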
def main():
	table = Datasheet()
	tel = ''
	street = ''
	locality = ''
	title = ''
	for i in range(3):
		page = i+1
		url = 	URL("http://torino.paginegialle.it/pgol/4-veterinari/3-torino/p-%s?mr=50" % page)
		print "collecting from %s" % url
		connection = url.open()
		doc = Document( connection.read() )
		items = doc.by_class('item_sx')
		row = []
		for j, item in enumerate(items):
			divs = item.by_class('address')
			try:	
				title = item.by_class('item_head')[0].by_tag('a')[0].content
			except IndexError, e:
				print >> sys.stderr, "%s" % j, e
				pass
			for z, div in enumerate(divs):
				if div != None:
					try:
						street = div.by_class('street-address')[0].content
						locality = div.by_class('locality')[0].content
						tel = div.by_class('tel')[0].by_class('value')[0].content
					except IndexError, e:
						print >> sys.stderr, "%s" % z, e
						pass
					save = "%s, %s %s, %s \n" % ( plaintext(title), plaintext(street).replace(",", ""), plaintext(locality).replace('(TO)', ''), plaintext(tel).replace(",", "") )
					print >> sys.stderr, save
					row.append(save)
Example #4
File: lsa.py Project: pegasos1/pyLSA
def summarize_evaluation(query=None, url=None, summary=None):
    j=[]
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
    vectors =[(dot(lsa.S,lsa.U[0,:]),dot(lsa.S,lsa.U[i,:])) for i in range(len(lsa.U))]
    vectors2 =[(dot(lsa2.S,lsa2.U[0,:]),dot(lsa2.S,lsa2.U[i,:])) for i in range(len(lsa2.U))]
    angles = [arccos(dot(a,b)/(norm(a,2)*norm(b,2))) for a in vectors for b in vectors2]
    return str(abs(1 - float(angles[1])/float(pi/2)))
Example #5
def heuristic_scrape(article):
    from pattern.web import URL, Document, HTTP404NotFound, URLError, plaintext
    try:
        s_content = URL(article).download()
    except (URLError, HTTP404NotFound):
        print "Error downloading article"
        return ("could not download", article)

    dom = Document(s_content)
    
    text = ''

    for node in dom.by_tag('p'):
        for c in node:
            if c.type == 'text':
                text = text + ' ' + plaintext(c.source())
    return text.strip()
Example #7
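# Scrape pharmacy listings from comuni-italiani.it for each Turin postal code in
# CAPS (defined elsewhere in the script) and save them to files/farmacie_torino.txt.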
def main():
	table = Datasheet()

	for cap in CAPS:
		url = 	URL("http://www.comuni-italiani.it/001/272/farmacie/cap%s.html" % cap)
		connection = url.open()
		doc = Document( connection.read() )
		items = doc.by_tag("table")
		row = []
		for j, td in enumerate( items[5].by_tag('td') ):
			strcap = "%s, Telefono:" % cap
			save = "%s" % plaintext(td.content).replace('\n', ',', 3).replace("Telefono:", strcap).replace(";", "").replace("Partita Iva", ",Partita Iva") + "\n"
			if save != None:
				row.append( save )
		table.append( row )
		print  "%s ----------------------------------------------------------------------------" % str(j)
		
	table.save("files/farmacie_torino.txt")
Example #8
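# Scrape the list of hospitals in Turin from comuniecitta.it, tagging each entry
# with the facility type taken from the heading preceding each list, and save the
# result to files/h_torino.txt.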
def main():
	table = Datasheet()

	url = 	URL("http://www.comuniecitta.it/torino/elenco-ospedali-di-torino.html")
	connection = url.open()
	doc = Document( connection.read() )
	items = doc.by_class('ulamm')[1:]
	row = []
	for ul in items:
		li = ul.by_tag('li')
		kind = plaintext(ul.previous.content)
		for el in li:
			if el != None:
				save = "%s, %s \n" % ( plaintext(el.content).replace('\n', ','), kind, )
				row.append(save)
	table.append( row )
		
	table.save("files/h_torino.txt")
Example #9
def summarize_evaluation(query=None, url=None, summary=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [
            word for sentence in j for word in sentence.split()
            if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word
            or '"' in word
        ]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [
            sentence for sentence in sentences
            if len(sentence) > 1 and sentence != ''
        ]
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
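    # Compare the LSA representation of the source text (lsa) with that of the
    # candidate summary (lsa2): build singular-value-weighted projection pairs
    # from U and S, measure the angle between them, and rescale it so that 1.0
    # means the directions coincide and 0.0 means they are orthogonal (pi/2).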
    vectors = [(dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :]))
               for i in range(len(lsa.U))]
    vectors2 = [(dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :]))
                for i in range(len(lsa2.U))]
    angles = [
        arccos(dot(a, b) / (norm(a, 2) * norm(b, 2))) for a in vectors
        for b in vectors2
    ]
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
Example #10
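# Download a page (2-minute timeout, cache disabled) and return it as a
# pattern.web Document, or None if the download fails; the unicode_escape pass
# below works around escaped markup on some pages.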
def get_dom(url):
    
    try:
        s_content = URL(url).download(timeout=120, cached=False)
    except (URLError, HTTP404NotFound):
        print "Error downloading article"
        return None

    #for AJE compatibility
    try:
        s_content = s_content.decode('unicode_escape')
    except (UnicodeEncodeError):
        pass
    
    return Document(s_content)
Example #11
for p in ("-", "-web", "-db", "-search", "-vector", "-graph", "-canvas", "-metrics", 
          "-de", "-en", "-es", "-fr", "-it", "-nl", 
          "-shell", "stop-words", "mbsp-tags", "-dev"):
    # We include some useful pages (Penn Treebank tags, stop words) referenced in the documentation.
    if p.startswith("-"):
        p = "pattern" + p.rstrip("-")
        title = p.replace("-", ".")
    if p == "stop-words":
        title = "Stop words"
    if p == "mbsp-tags":
        title = "Penn Treebank II tag set"
    # Download the online documentation pages.
    print "Retrieving", url + p
    html = URL(url + p).download(cached=False)
    # Parse the actual documentation; we don't need the website header, footer, navigation or search.
    html = Document(html)
    html = html.by_id("content-area")
    html = html.by_class("node-type-page")[0]
    html = html.source
    html = strip_javascript(html)
    html = strip_between('<div id="navbar">', '/#navbar -->', html)
    html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html)
    html = strip_between('<div id="footer">', '/#footer -->', html)
    html = strip_between('<a class="twitter-share-button"', '</a>', html)
    # Link to local pages and images.
    # Link to online media.
    html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url)                   # MBSP docs (online)
    html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url, html)   # examples (online)
    html = re.sub('href="/pages/(using-.*?)"', 'href="%s\\1"' % url, html)             # examples (online)
    html = re.sub('href="/pages/(modeling-.*?)"', 'href="%s\\1"' % url, html)          # examples (online)
    html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2', html)              # pages (offline)
Example #12
File: 07-dom.py Project: mlyne/Scripts
import os, sys
sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The Document object can be used for this, similar to the Javascript DOM.

# For example:
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
for e in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.get_elements_by_tagname(
            "a.title")[:1]:  # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print

# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in Document(url.download()).by_tag("a"):
    link = link.attributes.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    #print link
Example #13
File: 07-dom.py Project: daeon/pattern
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The Document object can be used for this, similar to the Javascript DOM.

# For example:
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
print dom.body.content.__class__
for e in dom.get_elements_by_tagname("div.entry")[:5]: # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]: # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print
        
# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in Document(url.download()).by_tag("a"):
    link = link.attributes.get("href","")
    link = abs(link, base=url.redirect or url.string)
    #print link

# The Document object is a tree of Element and Text objects.
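
# A minimal sketch (not part of the original example): walk that tree recursively
# and print each node's type (DOCUMENT, ELEMENT, TEXT or COMMENT).
def traverse(node, depth=0):
    print "  " * depth + node.type
    for child in node.children:
        traverse(child, depth + 1)

#traverse(dom)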
Example #14
    angles = [arccos(dot(a,b)/(norm(a,2)*norm(b,2))) for a,b in vectors[1:]]
    return str(abs(1 - float(angles[0])/float(pi/2)))

def graph(query1, query2):
    lsa = LSA(stopwords, ignore_characters)
    titles = [lsa.search_wiki(query1), lsa.search_wiki(query2)]
    for t in titles:
        lsa.parse(t)
    lsa.build()
    lsa.calc()
    lsa.plotSVD()

## core summarization function.
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()