def gatherPostData(soup):
    # Find post title
    post_title = clean_html(str(soup.find('a', {'rel': 'bookmark'})))
    # Find publish date
    date = clean_html(str(soup.find('div', {'class': 'date'}).find('span')))
    # Find author
    author = clean_html(str(soup.find('a', {'rel': 'author'})))
    # Find number of comments
    num_comments = int(clean_html(str(soup.find('div', {'class': 'comm'}).find('a'))).split()[0].replace(',', ''))
    return [post_title, date, author, num_comments]
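# Usage sketch, not from the original source: fetch a page, parse it, and unpack
# the four fields. The URL is a placeholder, and the BeautifulSoup 4 import is an
# assumption; the function above works the same way with BeautifulSoup 3.
import urllib2
from bs4 import BeautifulSoup

html = urllib2.urlopen('http://example.com/some-post').read()
soup = BeautifulSoup(html)
post_title, date, author, num_comments = gatherPostData(soup)
print post_title, date, author, num_comments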
def punctuation_marks(document):
    text = clean_html(document.content)
    return {
        "#dots": text.count("."),
        "#commas": text.count(","),
        "#questions": text.count("?"),
        "#exclamations": text.count("!")
    }
def spelling_errors(document):
    d = enchant.Dict("en_US")
    num = 0
    tokens = nltk.word_tokenize(clean_html(document.content))
    for token in tokens:
        if len(token) >= 2 and not d.check(token):
            num += 1
    return {'#spelling_errors': num}
def sentiment(document):
    classifier = nltk.data.load("classifiers/polarity_NaiveBayes.pickle")
    words = nltk.word_tokenize(clean_html(document.content))
    # Combine unigrams and bigrams into a single feature list
    words_ngrams = reduce(operator.add, [words if n == 1 else ngrams(words, n) for n in [1, 2]])
    features = dict([(words_ngram, True) for words_ngram in words_ngrams])
    polarity = classifier.classify(features)
    return {"?polarity": (0 if polarity == "neg" else 1)}
def text_complexity(document):
    word_freq = nltk.FreqDist(w.lower() for w in nltk.word_tokenize(clean_html(document.content)))
    n = len(word_freq.samples())
    c = 0.0
    log10_n = math.log10(n)
    for f in word_freq.items():
        c += f[1] * (log10_n - math.log10(f[1]))
    c = c * (1.0 / len(word_freq.samples()))
    return {'@text_complexity': c}
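# Sketch, not from the original source, of how the extractors above might be
# combined into one feature dict per document. The Document class is a stand-in
# for whatever object the original pipeline passes in; all it needs is a
# .content attribute holding raw HTML.
class Document(object):
    def __init__(self, content):
        self.content = content

def extract_features(document):
    features = {}
    for extractor in (punctuation_marks, spelling_errors, sentiment, text_complexity):
        features.update(extractor(document))
    return features

# Example (requires nltk, pyenchant, and the pickled polarity classifier):
# features = extract_features(Document("<p>Some post body. Really?</p>"))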
def __call__(self, env, start_response):
    # While not necessary, webob makes this easy
    request = Request(env)
    # Call up the middleware pipeline
    response = request.get_response(self.app)
    # Is the body HTML? (This assumes our WSGI app is doing sane things
    # and setting a DOCTYPE.)
    if re.match(r'\s*?<!DOCTYPE\s*?html', response.body):
        # If the path ends in .tldr
        if re.search(r'\.tldr\s*?$', request.path):
            # Summarize the HTML
            response.body = self.summary_html(
                summarize(self.number, self.context, clean_html(response.body)))
    return response(env, start_response)
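# Wiring sketch, not from the original source. It assumes the class that owns
# __call__ above is constructed with the wrapped WSGI app plus the number and
# context values it reads from self; the class name TldrMiddleware and the
# constructor signature are assumptions, since the original snippet shows neither.
#
# from wsgiref.simple_server import make_server
# app = TldrMiddleware(my_wsgi_app, number=3, context=1)
# make_server('', 8000, app).serve_forever()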
def main():
    if len(sys.argv) != 3:
        print "usage: ./html2text.py <urlfile> <outputdir>"
        sys.exit(1)
    with open(sys.argv[1]) as f:
        urls = [line.rstrip('\n') for line in f]
    for url in urls:
        filename = url.rsplit('/', 1)[1] + ".txt"
        response = urllib2.urlopen(url)
        text = clean_html(response.read())
        cleaned_text = re.sub(remove_footnotes, '', text)
        # Create the output directory if needed (note: the exists/makedirs pair
        # is racy if something else creates the directory at the same time)
        if not os.path.exists(sys.argv[2]):
            os.makedirs(sys.argv[2])
        with open(os.path.join(sys.argv[2], filename), 'w') as f:
            f.write(cleaned_text)
def part_of_speach(document):
    text = clean_html(document.content)
    sentences = nltk.sent_tokenize(text)
    features = {}
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        for word, tag in tagged:
            # Skip one-character tags (mostly punctuation)
            if len(tag) < 2:
                continue
            fname = "#" + tag
            if fname in features:
                features[fname] += 1
            else:
                features[fname] = 1
    return features
def getWebPg(link):
    # Open the url with a browser-like User-agent
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    f = opener.open(link)
    # Read in url and store page
    page = f.read()
    f.close()
    # Parse out text (clean html tags)
    page = clean_html(page)
    # Drop everything from the "References" section onward
    match = re.search(r'^\s*References\s*$', page, re.MULTILINE)
    if match:
        endPage = match.start()
        page = page[:endPage]
    return page
# Web scraping from hw5
page_to_scrape = 'http://101books.net/archives/'  # A book blog
# headers = ["Date", "Title"]
# filename = "blog_info_ramram.csv"
# readFile = open(filename, "wb")
# csvwriter = csv.writer(readFile)
# csvwriter.writerow(headers)
webpage = urllib2.urlopen(page_to_scrape)
soup = BeautifulSoup(webpage.read())
soup.prettify()

# For title table
titles = soup.findAll("li", attrs={'class': "clear"})
for title in titles:
    t = clean_html(title.find("a")['title'])
    t = t[18:]
    print "{0}".format(t.encode('ascii', 'ignore'))

title_array = []
# Scraping the author name works but does not return a clean string,
# so the author list is filled in directly.
author_array = ['Robert'] * 25
for i in range(25):
    title = titles[i]
    t = clean_html(title.find("a")['title'])[18:]
    t = t.encode('ascii', 'ignore')
    title_array.append(t)
# print title_array
# print author_array

# Connect to the local database
def _clean_content(self):
    raw_content_text = self._content
    cleaned_text = clean_html(raw_content_text)
    # Strip the leading "<word> <ISO timestamp> " prefix, then lowercase
    raw_content = sub(r'\w+ \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2} ', '', cleaned_text)
    tiny_raw_content = raw_content.lower()
    return tiny_raw_content
# Extract author
authors = soup.findAll("span", attrs={'class': 'author vcard'})
# Extract url
urls = soup.findAll("h2")
# Extract post_title
post_titles = soup.findAll("h2", attrs={'class': 'entry-title'})
# Extract comment count
comments_count = soup.findAll("a", href=re.compile('comments$'))

# Iterate in reverse ([::-1]) so the posts come out in chronological order
for i in range(48)[::-1]:
    author = authors[i]
    a = clean_html(str(author))
    date = post_dates[i]
    d = clean_html(str(date))
    url = urls[i]
    u = clean_html(str(url.find("a")["href"]))
    title = post_titles[i]
    t = clean_html(str(title.find("a")))
    count = comments_count[i]
    c = clean_html(str(count))
    # is_post: 1 if the crawler thinks the page is a post, 0 otherwise
    if authors[i] != "" and post_titles[i] != "" and urls[i] != "":
        p = 1
    else:
        p = 0
    csvwriter.writerow([d, a, u, t, c, p])

readFile.close()
soup.prettify()
# Extract posts
post_entries = soup.findAll("div", attrs={'id': re.compile('^post')})
is_post = []
post_date = []
post_author = []
post_url = []
post_titles = []
# First loop is for the homepage
for i in post_entries:
    temp_is_post = True
    temp_titles = clean_html(str(i.find("a")))
    temp_date = i.find("div", attrs={'class': 'date'})
    temp_date = clean_html(str(temp_date))
    temp_author = i.find("a", attrs={'rel': 'author'})
    temp_author = clean_html(str(temp_author))
    temp_url = i.find("a", attrs={'class': 'more-link'}).get("href")
    is_post.append(temp_is_post)
    post_date.append(temp_date)
    post_author.append(temp_author)
    post_url.append(temp_url)
    post_titles.append(temp_titles)

# Scrape from the 2nd page on...
n = 1
for j in range(2, 536):
#!/usr/bin/env python
import sys
from nltk.util import clean_html

if len(sys.argv) != 3:
    sys.stderr.write('Usage: <input file> <output file>\n')
    sys.exit(1)

html = ''
inpFile = open(sys.argv[1], 'r')
outFile = open(sys.argv[2], 'w')
for line in inpFile.readlines():
    html = html + line
outFile.write(clean_html(html) + '\n')
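# Invocation sketch; the script's filename is not given above, so "html2txt.py"
# is a stand-in:
#   python html2txt.py page.html page.txt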
webpage = urllib2.urlopen(page_to_scrape)
# Parse it
soup = BeautifulSoup(webpage.read())
soup.prettify()
Articles = soup.findAll("div", attrs={'class': 'hnews hentry item'})
for article in Articles:
    observation = []
    # Determine if post
    for item in article.findAll("span", {"id": True}):
        if item["id"] == "mainentry":
            observation.append(1)
        else:
            observation.append(0)
    # Post date
    observation.append(clean_html(str(article.find("span", attrs={"class": "timestamp"}))))
    # Authors
    observation.append(clean_html(str(article.find("span", attrs={"class": "author vcard"}))))
    # URL
    past_urls = []
    for item in article.fetch("a"):
        temp_url = item["href"]
        if temp_url[-10:] == "/#comments":
            continue
        elif temp_url[0:7] != "http://":
            continue
        elif str(temp_url) in past_urls:
            continue
        else:
            observation.append(str(temp_url))
            past_urls.append(str(temp_url))
# Parse it
soup = BeautifulSoup(webpage.read())
soup.prettify()
# Extract the posts on the page
dates = soup.findAll("h2", attrs={"class": "date-header"})
posts = soup.findAll("div", attrs={"class": "post hentry uncustomized-post-template"})
urls = soup.findAll("h3", attrs={"class": "post-title entry-title"})
titles = soup.findAll("h3", attrs={"class": "post-title entry-title"})
comments = soup.findAll("span", attrs={"class": "post-comment-link"})
authors = soup.findAll("a", attrs={"class": "g-profile"})
for i in range(len(dates)):
    post = posts[i]
    p = 0
    if clean_html(str(post.find("a")["name"])):
        p = 1
    date = dates[i]
    d = clean_html(str(date.find("span")))
    author = authors[i]
    a = clean_html(str(author.find("span")))
    url = urls[i]
    u = clean_html(str(url.find("a")["href"]))
    title = titles[i]
    t = clean_html(str(title.find("a")))
    comment = comments[i]
    c = clean_html(str(comment.find("a"))).split()[0]
    if c == "No":
        c = 0
    table_source = Source("Ph D talk", 'http://phdtalk.blogspot.com/')
    session.add(table_source)
    table_scrape = Scrapes(p, d, u, t, a, c)
    table_source.scrape.append(table_scrape)
with con:
    cur = con.cursor()
    cur.execute("SELECT id, date_txt, text FROM Bioworld_Today")
    #cur.execute("SELECT id, date_txt, text FROM Bioworld_Today WHERE date > '1990-12-31' and date < '1993-01-01'")
    articles = cur.fetchall()

# Build the documents in date order
documents = {}
for idx, article in enumerate(articles):
    print "%d of %d" % (idx, len(articles))
    aid, date_txt, html = article
    d = parser.parse(date_txt)
    if d in documents:
        documents[d].append(clean_html(html.decode('utf8')))
    else:
        documents[d] = [clean_html(html.decode('utf8'))]

dates = documents.keys()
dates.sort()
years = {}
raw = ""
for date in dates:
    # Build one big text, separating each day's articles with a date header
    raw += "\n\n-----" + str(date) + "-----\n\n" + "\n\n".join(documents[date])
    year = date.strftime("%Y")
    if year in years:
        years[year] += documents[date]
def cleanXML(text):
    from nltk.util import clean_html
    return clean_html(text)
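# Side note, not from the original source: clean_html was removed from nltk.util
# in NLTK 3, which points users at BeautifulSoup instead. A rough drop-in sketch
# (assumes bs4 is installed; whitespace handling differs slightly from clean_html):
def cleanXML_bs4(text):
    from bs4 import BeautifulSoup
    return BeautifulSoup(text, "html.parser").get_text()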
#so add -c to not dl if remote same size os.system("wget -r -c -l0 -t1 -N -np -A.html,shtml -erobots=off http://www.thelatinlibrary.com/indices.html") print "Removing indices and other non-Latin files ..." os.system("rm -r www.thelatinlibrary.com/101/ www.thelatinlibrary.com/imperialism/ www.thelatinlibrary.com/ll2/ www.thelatinlibrary.com/law/ www.thelatinlibrary.com/romhist/ www.thelatinlibrary.com/satire/ www.thelatinlibrary.com/sallust/ www.thelatinlibrary.com/historians/ www.thelatinlibrary.com/certamen/ www.thelatinlibrary.com/caligula/ www.thelatinlibrary.com/caes/ www.thelatinlibrary.com/apul/ www.thelatinlibrary.com/august.html www.thelatinlibrary.com/ammianus.html www.thelatinlibrary.com/alanus.html www.thelatinlibrary.com/apicius.html www.thelatinlibrary.com/albertanus.html www.thelatinlibrary.com/albertofaix.html www.thelatinlibrary.com/alcuin.html www.thelatinlibrary.com/avienus.html www.thelatinlibrary.com/appverg.html www.thelatinlibrary.com/arnobius.html www.thelatinlibrary.com/apuleius.html www.thelatinlibrary.com/aquinas.html www.thelatinlibrary.com/alice.html www.thelatinlibrary.com/ausonius.html www.thelatinlibrary.com/abelard.html www.thelatinlibrary.com/about.html www.thelatinlibrary.com/anselm.html www.thelatinlibrary.com/addison.html www.thelatinlibrary.com/aug.html www.thelatinlibrary.com/ambrose.html www.thelatinlibrary.com/egeria.html www.thelatinlibrary.com/hyginus.html www.thelatinlibrary.com/iordanes.html www.thelatinlibrary.com/epubs.html www.thelatinlibrary.com/erasmus.html www.thelatinlibrary.com/decl.html www.thelatinlibrary.com/des.html www.thelatinlibrary.com/eutropius.html www.thelatinlibrary.com/florus.html www.thelatinlibrary.com/forsett.html www.thelatinlibrary.com/frame1.html www.thelatinlibrary.com/frame2.html www.thelatinlibrary.com/frontinus.html www.thelatinlibrary.com/commodianus.html www.thelatinlibrary.com/curtius.html www.thelatinlibrary.com/dante.html www.thelatinlibrary.com/contemp.html www.thelatinlibrary.com/cred.html www.thelatinlibrary.com/fulgentius.html www.thelatinlibrary.com/gaius.html www.thelatinlibrary.com/gellius.html www.thelatinlibrary.com/gestafrancorum.html www.thelatinlibrary.com/celtis.html www.thelatinlibrary.com/corvinus.html www.thelatinlibrary.com/godfrey.html www.thelatinlibrary.com/bultelius.html www.thelatinlibrary.com/claudian.html www.thelatinlibrary.com/cassiodorus.html www.thelatinlibrary.com/bible.html www.thelatinlibrary.com/caes.html www.thelatinlibrary.com/columba.html www.thelatinlibrary.com/campion.html www.thelatinlibrary.com/capellanus.html www.thelatinlibrary.com/columella.html www.thelatinlibrary.com/cato.html www.thelatinlibrary.com/certamen.html www.thelatinlibrary.com/christian.html www.thelatinlibrary.com/cic.html www.thelatinlibrary.com/classics.html www.thelatinlibrary.com/boethiusdacia.html www.thelatinlibrary.com/bede.html www.thelatinlibrary.com/bennett.html www.thelatinlibrary.com/bernardcluny.html www.thelatinlibrary.com/balde.html www.thelatinlibrary.com/bacon.html www.thelatinlibrary.com/manilius.html www.thelatinlibrary.com/miscmed.html www.thelatinlibrary.com/nemesianus.html www.thelatinlibrary.com/martial.html www.thelatinlibrary.com/malaterra.html www.thelatinlibrary.com/neo.html www.thelatinlibrary.com/nepos.html www.thelatinlibrary.com/marcellinus.html www.thelatinlibrary.com/liberpontificalis.html www.thelatinlibrary.com/may.html www.thelatinlibrary.com/medieval.html www.thelatinlibrary.com/melancthon.html www.thelatinlibrary.com/mirandola.html www.thelatinlibrary.com/misc.html 
www.thelatinlibrary.com/modinst.html www.thelatinlibrary.com/newton.html www.thelatinlibrary.com/leo.html www.thelatinlibrary.com/nithardus.html www.thelatinlibrary.com/lhomond.html www.thelatinlibrary.com/notitia.html www.thelatinlibrary.com/luther.html www.thelatinlibrary.com/phaedrus.html www.thelatinlibrary.com/lactantius.html www.thelatinlibrary.com/martinofbraga.html www.thelatinlibrary.com/leges.html www.thelatinlibrary.com/mapps.html www.thelatinlibrary.com/lucan.html www.thelatinlibrary.com/lucretius.html www.thelatinlibrary.com/orosius.html www.thelatinlibrary.com/ovid.html www.thelatinlibrary.com/ottofreising.html www.thelatinlibrary.com/papal.html www.thelatinlibrary.com/pascoli.html www.thelatinlibrary.com/patricius.html www.thelatinlibrary.com/pauldeacon.html www.thelatinlibrary.com/landor.html www.thelatinlibrary.com/leothegreat.html www.thelatinlibrary.com/liv.html www.thelatinlibrary.com/justin.html www.thelatinlibrary.com/justinian.html www.thelatinlibrary.com/juvenal.html www.thelatinlibrary.com/jerome.html www.thelatinlibrary.com/janus.html www.thelatinlibrary.com/sedulius.html www.thelatinlibrary.com/sall.html www.thelatinlibrary.com/ter.html www.thelatinlibrary.com/solinus.html www.thelatinlibrary.com/ritchie.html www.thelatinlibrary.com/sabinus.html www.thelatinlibrary.com/sidonius.html www.thelatinlibrary.com/sannazaro.html www.thelatinlibrary.com/sigebert.html www.thelatinlibrary.com/williamtyre.html www.thelatinlibrary.com/sen.html www.thelatinlibrary.com/tertullian.html www.thelatinlibrary.com/seneca.html www.thelatinlibrary.com/sha.html www.thelatinlibrary.com/vallauri.html www.thelatinlibrary.com/silius.html www.thelatinlibrary.com/waltarius.html www.thelatinlibrary.com/spinoza.html www.thelatinlibrary.com/statius.html www.thelatinlibrary.com/suet.html www.thelatinlibrary.com/sulpiciusseverus.html www.thelatinlibrary.com/tac.html www.thelatinlibrary.com/theodosius.html www.thelatinlibrary.com/tib.html www.thelatinlibrary.com/valeriusflaccus.html www.thelatinlibrary.com/vitruvius.html www.thelatinlibrary.com/readme2005.html www.thelatinlibrary.com/readme2007.html www.thelatinlibrary.com/richerus.html www.thelatinlibrary.com/readme2006.html www.thelatinlibrary.com/readme1999.html www.thelatinlibrary.com/readme.html www.thelatinlibrary.com/readme2000.html www.thelatinlibrary.com/readme1998.html www.thelatinlibrary.com/readme2001.html www.thelatinlibrary.com/readme2002.html www.thelatinlibrary.com/readme2003.html www.thelatinlibrary.com/readme2004.html www.thelatinlibrary.com/quintilian.html www.thelatinlibrary.com/livius/ www.thelatinlibrary.com/livy/liv.per.shtml www.thelatinlibrary.com/plautus.html www.thelatinlibrary.com/pliny.html www.thelatinlibrary.com/pliny1.html www.thelatinlibrary.com/augustine/serm.shtml www.thelatinlibrary.com/cicero/adbrutum.shtml www.thelatinlibrary.com/cicero/cat.shtml www.thelatinlibrary.com/cicero/fam.shtml www.thelatinlibrary.com/cicero/fin.shtml www.thelatinlibrary.com/cicero/inventione.shtml www.thelatinlibrary.com/cicero/fratrem.shtml www.thelatinlibrary.com/cicero/leg.shtml www.thelatinlibrary.com/cicero/legagr.shtml www.thelatinlibrary.com/cicero/oratore.shtml www.thelatinlibrary.com/cicero/phil.shtml www.thelatinlibrary.com/cicero/off.shtml www.thelatinlibrary.com/cicero/repub.shtml www.thelatinlibrary.com/cicero/tusc.shtml www.thelatinlibrary.com/cicero/ver.shtml www.thelatinlibrary.com/cicero/nd.shtml www.thelatinlibrary.com/cicero/epis.shtml www.thelatinlibrary.com/virgil/index.html www.thelatinlibrary.com/varro.html 
www.thelatinlibrary.com/valmax.html www.thelatinlibrary.com/prop.html www.thelatinlibrary.com/Voc.html www.thelatinlibrary.com/Vocab.html www.thelatinlibrary.com/Vocab2.html www.thelatinlibrary.com/tertullian/tertullian.cultu.shtml www.thelatinlibrary.com/tertullian/tertullian.marcionem.shtml www.thelatinlibrary.com/tertullian/tertullian.nationes.shtml www.thelatinlibrary.com/tertullian/tertullian.uxor.shtml www.thelatinlibrary.com/prud.html www.thelatinlibrary.com/pomponius.html www.thelatinlibrary.com/sedulius.html www.thelatinlibrary.com/vegetius.html www.thelatinlibrary.com/vell.html www.thelatinlibrary.com/verg.html www.thelatinlibrary.com/addison.html www.thelatinlibrary.com/albertanus.html")

print "Stripping HTML and changing extensions to .txt ..."
for r, d, f in os.walk("www.thelatinlibrary.com"):
    for files in f:
        if files.endswith("html"):
            path = os.path.join(r, files)
            # Read the raw HTML, strip the markup, and write the text back in place
            opened = open(path, 'r')
            readed = opened.read()
            opened.close()
            new_opened = open(path, "w")
            new_opened.write(clean_html(readed))
            new_opened.close()
            # Change the extension to .txt
            fileName, fileExtension = os.path.splitext(path)
            os.rename(fileName + fileExtension, fileName + ".txt")

print "Creating Public Domain LICENSE ..."
os.system("touch www.thelatinlibrary.com/LICENSE.md")
os.system("printf 'Public Domain Mark 1.0\n----------------------\n### No Copyright\nThis work has been identified as being free of known restrictions under copyright law, including all related and neighboring rights.\n\nYou can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission. See Other Information below.\n\n### Other Information\n- The work may not be free of known copyright restrictions in all jurisdictions.\n- Persons may have other rights in or related to the work, such as patent or trademark rights, and others may have rights in how the work is used, such as publicity or privacy rights.\n- In some jurisdictions moral rights of the author may persist beyond the term of copyright. These rights may include the right to be identified as the author and the right to object to derogatory treatments.\n- Unless expressly stated otherwise, the person who identified the work makes no warranties about the work, and disclaims liability for all uses of the work, to the fullest extent permitted by applicable law.\n- When using or citing the work, you should not imply endorsement by the author or the person who identified the work.\n\nA copy of this Mark is available at: <https://creativecommons.org/publicdomain/mark/1.0/>.' >> www.thelatinlibrary.com/LICENSE.md")

print "Creating README.md ..."
os.system("touch www.thelatinlibrary.com/README.md")
os.system('printf "About the Latin Library\n=======================\n\nThe Latin Library is a collection of a wide variety of texts from the archaic period to the modern era. Altogether the corpus is about 108 MB.\n\nThese files are in the public domain, [as explained here](http://thelatinlibrary.com/about.html). For a declaration of their status in public domain, see LICENSE.md." >> www.thelatinlibrary.com/README.md')

print "Renaming corpus to thelatinlibrary ..."
os.system("mv www.thelatinlibrary.com thelatinlibrary")
# Parse it
soup = BeautifulSoup(webpage.read())
soup.prettify()
# Extract all post titles, publish dates, comment counts, and urls
post_title = soup.findAll("h2", attrs={'class': 'entry-title'})
publish_date = soup.findAll("div", attrs={'class': 'postMeta'})
comment_count = soup.findAll("p", attrs={'class': 'container'})
urls = soup.findAll("h2", class_="entry-title")
for i in range(10):
    date = publish_date[i]
    d = clean_html(str(date.find("span", attrs={'class': 'date'})))
    d_clean = d.decode('utf-8')
    url = urls[i]
    u = url('a')[0]['href']
    u_clean = u.decode('utf-8')
    post = post_title[i]
    p = clean_html(str(post.find("a")))
    p_clean = p.decode('utf-8')
    comment = comment_count[i]
    c = clean_html(str(comment.find("span", attrs={'class': 'comments'})))
def remove_html(text):
    # Strip the HTML tags and URLs from the text
    return clean_html(text)
links.append(link.find("a").get('href')) for link in soup.findAll("div", attrs = {'class':'pagenav clearfix'}): for ref in link.findAll("a"): #not sure how this could've been not a nested for loop. links.append(ref.get("href")) #add all links on current page that haven't been visited and aren't already on the list #inspiration for this step taken from David Carlson's code for link in links: if (link not in links2) and (link not in visited): links2.append(link) if len(soup.findAll(content = "article")) != 0: #is it a post? If so add info and if not move to next is_post = True date = soup.findAll("time", attrs = {'class':"post-date"}) for i in date: post_date = clean_html(str(i)) author = soup.findAll("a", attrs = {'rel':"author"}) for i in author: post_author = clean_html(str(i)) title = soup.findAll("h1", attrs = {'class':'post-title'}) for i in title: post_title = clean_html(str(i)) comments = soup.findAll("span", attrs = {'class':'post-comment'}) for i in comments: post_comments = clean_html(str(i)) if post_comments == "No Comments": post_comments = 0 csvwriter.writerow([page_to_scrape, is_post, post_date, post_author, post_title, post_comments]) #Write these to the CSV else: csvwriter.writerow([page_to_scrape, False, None, None, None, None]) #Write these to the CSV
# Parse it
soup = BeautifulSoup(webpage.read())
soup.prettify()

# Links to posts
links = soup.findAll("li", attrs={'class': "clear"})
l = []
for link in links:
    l.append(str(link.find("a")['href']))

# Is post?
posts = soup.findAll("li", attrs={'class': "clear"})
print len(posts)
for post in posts:
    p = clean_html(str(post.find("a")['rel']))
    print p

# Date
dates = soup.findAll("li", attrs={'class': "clear"})
print len(dates)
for date in dates:
    d = clean_html(str(date.find("span")))
    print d

# Title
titles = soup.findAll("li", attrs={'class': "clear"})
print len(titles)
for title in titles:
    t = clean_html(title.find("a")['title'])
    t = t[18:]
# links = []
# for tag in tags:
#     links.append(tag['href'])
# print links
#
# for i in links:
#     webpages[i] = urllib2.urlopen(links[i])  # must be indices not unicode
#     time.sleep(1)
#     soups = BeautifulSoup(webpages.read())

# Is post?
posts = soup.findAll("div", attrs={'class': "entry"})
print len(posts)
for post in posts:
    p = clean_html(str(post.find("a")['rel']))
    print p

# Author
# authors = soup.findAll("div", attrs={'class': "single-post-meta"})
# for author in authors:
#     a = clean_html(str(author.find("div")))
#     print a

# Date
dates = soup.findAll("div", attrs={'class': "post-date"})
print len(dates)
for date in dates:
    d = clean_html(str(date.find("p")))
    print d
if __name__ == '__main__':
    import sys
    from nltk.util import clean_html

    print sys.argv
    inHTML = sys.argv[1]
    outText = sys.argv[2]
    fd = open(inHTML)
    ct = fd.read()
    fd.close()
    text = clean_html(ct)
    fd = open(outText, 'w')
    fd.write(text)
    fd.close()
webpage = urllib2.urlopen(page_to_scrape)  # Open homepage
soup = BeautifulSoup(webpage.read())  # Parse homepage
soup.prettify()
flight_deals = soup.findAll("a", attrs={'rel': 'bookmark'})
first_deal = flight_deals[0]  # Find first post from the homepage
new_url = first_deal.get('href')
print new_url

webpage = urllib2.urlopen(new_url)  # Open the first post page
soup = BeautifulSoup(webpage.read())
soup.prettify()
url_div = soup.find("div", attrs={'class': 'prev_next'})
previous_checker = clean_html(str(url_div.find("p")))

flights_array = []  # Array used to store info from each post before putting into CSV file

# Each post's html links to "Previous post", except the final one,
# so the while loop continues until the final post on the blog.
while "Previous post" in previous_checker:
    post_page = urllib2.urlopen(new_url)  # Open the webpage with the blog post
    soup = BeautifulSoup(post_page.read())  # Parse the webpage
    soup.prettify()
    title_of_deal = soup.find("title")  # Extract title of deal on page
    clean_title = clean_html(str(title_of_deal))
    date_deal_posted = soup.find("p", attrs={'class': 'headline_meta'})  # Extract date of deal post
    clean_date = clean_html(str(date_deal_posted))
def open_html_file(self, html_file):
    html_and_text = urlopen(html_file).read()
    raw_text_and_space = clean_html(html_and_text)
    return raw_text_and_space
time.sleep(2)
# Extract the author
authors = soup.findAll("div", attrs={'class': 'blog-byline'})
time.sleep(2)
# Extract the time and the date
dates = soup.findAll("span", attrs={'class': 'updated'})

# Indicate that the page is a post if articles are listed in reverse chronological order
dates_list = []
for i in range(95):
    date = dates[i]
    date = clean_html(str(date))   # Clean up the markup
    date = date.split(", ")[1::]   # Get rid of the time part
    date = ''.join(date)           # Join month, day, and year as a string
    date = date.replace('/', '')   # Get rid of the forward slashes
    date = int(date)               # Make it an integer, which should be bigger for later dates in July 2014
    dates_list.append(date)

posts_list = []
for i in range(94):
    if dates_list[i] >= dates_list[i + 1]:  # Compare two contiguous dates
        posts_list.append(True)
        if i + 1 == 94:  # Add True for the last article if the previous article got True
            posts_list.append(True)
    else:
        posts_list.append(False)
        if i + 1 == 94:  # Add False for the last article if the previous article got False
            posts_list.append(False)
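# Worked example of the date normalization above, assuming timestamps shaped like
# "9:22 AM, 07/14/2014" (the exact source format is not shown in this snippet):
#   split(", ")[1:]  -> ['07/14/2014']
#   ''.join(...)     -> '07/14/2014'
#   replace('/', '') -> '07142014'
#   int(...)         -> 7142014, so later dates in the month compare larger.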
# Extract date
dates = soup.findAll("span", attrs={'class': 'postdate'})
# Extract author
authors = soup.findAll("span", attrs={'class': 'postauthor'})
# Extract url
urls = soup.findAll("h2", attrs={'class': 'posttitle'})
# Extract comments
comments = soup.findAll("span", attrs={'class': 'postcomment'})
for i in range(len(titles)):
    title = titles[i]
    t = clean_html(str(title.find("a")))
    date = dates[i]
    d = clean_html(str(date))
    d = datetime.strptime(d, "%d %B %Y, %I:%M %p")  # Convert into a standard datetime
    a = authors[i]
    a = clean_html(str(a.find("a")))
    url = urls[i]
    u = url.find("a").get("href")
    comment = comments[i]
    c = clean_html(str(comment.find("a")))
    c = re.findall(r"\d\S*", c)
    if c == []:
        c = 0
    else:
        c = int(c[0])
    # is_post: True only if title, date, url, and comment count all look valid
    ispost = False
    if len(t) != 0 and len(str(d)) != 0 and len(u) != 0 and isinstance(c, int):
        ispost = True
    csvwriter.writerow([ispost, d, a, u, t, c])
csvwriter = csv.writer(readFile)
csvwriter.writerow(headers)
# Open webpage
webpage = urllib2.urlopen(page_to_scrape)
# Parse it
soup = BeautifulSoup(webpage.read())
soup.prettify()
# Extract petitions on page
#petitions = soup.findAll("a", href=re.compile('^/petition'))
petitions = soup.findAll("div", attrs={'class': 'title'})
print len(petitions)
for petition in petitions:
    p = clean_html(str(petition.find("a")))
    print p

signatures = soup.findAll("div", attrs={'class': 'num-sig'})
print len(signatures)
for signature in signatures:
    s = clean_html(str(signature.find("span", attrs={'class': 'num'})))
    print s

for i in range(20):
    petition = petitions[i]
    p = clean_html(str(petition.find("a")))
    signature = signatures[i]
    s = clean_html(str(signature.find("span", attrs={'class': 'num'})))
    csvwriter.writerow([p, s])
readFile = open(filename, "wb") csvwriter = csv.writer(readFile) csvwriter.writerow(headers) # Open webpage webpage = urllib2.urlopen(page_to_scrape) # Parse it soup = BeautifulSoup(webpage.read()) soup.prettify() # Extract titles on page titles = soup.findAll("h2", attrs={'class':'entry-title'}) #print len(titles) for title in titles: p = clean_html(str(title.find("a"))) # print p authors = soup.findAll("div", attrs={'class':'blog-byline'}) #print len(authors) for author in authors: s = clean_html(str(author)) s = "".join(s.split("By ")[1::]) # print s dates = soup.findAll("span", attrs={'class':'timestamp'}) #print len(dates) for date in dates: d = clean_html(str(date)) d = "".join(d.split("Posted at ")[1::]) # print d