def scrape_links_and_wordlistify(links, lower=False, verbose=1):
    import nltk
    import requests
    import string
    raw = ''
    wordlist = {}
    for site in links:
        try:
            if verbose == 1:
                print '[+] fetching data from: ', site
            if site.find('http://pastebin.com/') == 0:
                raw = requests.get(site.replace('http://pastebin.com/', 'http://pastebin.com/raw.php?i=')).content
            else:
                raw = requests.get(site).content
            if lower == False:
                l = string.translate(nltk.clean_html(raw), string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
            else:
                l = string.lower(nltk.clean_html(raw))
                l = string.translate(l, string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
        except:
            if verbose == 1:
                print '[-] Skipping url: ', site
    return wordlist

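# freq_an() is called above but not defined in this snippet; the sketch below is an
# assumption about its behaviour (a plain frequency tally into the shared dict), not
# the original implementation.
def freq_an(words, wordlist):
    # Count each word into the running wordlist dictionary.
    for word in words:
        wordlist[word] = wordlist.get(word, 0) + 1
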
def extract_content(self, raw):
    logging.info('Processor.extract_content')
    soup = BeautifulSoup(raw)
    cable_table = soup.find("table", {"class": "cable"})
    cable_id = cable_table.findAll('tr')[1].findAll('td')[0]\
        .contents[1].contents[0]
    if db.cables.find_one({'_id': cable_id}):
        self.counts['files_not_processed'] = self.counts['files_not_processed'] + 1
        logging.info('Processor.extract_content["CABLE ALREADY EXISTS"]')
        self.print_counts()
        return
    cable = Cable(raw)
    cable['_id'] = cable_id
    cable['reference_id'] = cable_id
    cable['date_time'] = cable_table.findAll('tr')[1].findAll('td')[1]\
        .contents[1].contents[0]
    cable['classification'] = cable_table.findAll('tr')[1].findAll('td')[2]\
        .contents[1].contents[0]
    cable['origin'] = cable_table.findAll('tr')[1].findAll('td')[3]\
        .contents[1].contents[0]
    cable['header'] = nltk.clean_html(str(soup.findAll(['pre'])[0]))
    cable['body'] = nltk.clean_html(str(soup.findAll(['pre'])[1]))
    db.cables.insert(cable.get())
    self.counts['files_processed'] = self.counts['files_processed'] + 1
    self.print_counts()
    if (self.counts['files_processed'] + self.counts['files_not_processed']) \
            == self.counts['files_to_process']:
        self.dump_json()

def process_feed(self, entries):
    abbr = self.abbr
    feed_entries = db.feed_entries
    third = itemgetter(2)
    # Find matching entities in the feed.
    for entry, matches in self.scan_feed(entries):
        matches = self.extract_entities(matches)
        ids = map(third, matches)
        strings = [m.group() for m, _, _ in matches]
        assert len(ids) == len(strings)
        # Add references and save in mongo.
        entry['state'] = abbr  # list probably wiser
        entry['entity_ids'] = ids or None
        entry['entity_strings'] = strings or None
        entry['save_time'] = datetime.datetime.utcnow()
        entry['_id'] = new_feed_id(entry)
        entry['_type'] = 'feedentry'
        entry['summary'] = nltk.clean_html(entry['summary'])
        try:
            entry['summary_detail']['value'] = nltk.clean_html(
                entry['summary_detail']['value'])
        except KeyError:
            pass
        feed_entries.save(entry)
        msg = 'Found %d related entities in %r'
        self.logger.info(msg % (len(ids), entry['title']))

def scrapeBlog(url, depth):
    # NB: hack
    allText = ""
    pages = getPages(url)
    pages = pages[(depth + 1):]  # take the rest
    posts = []
    timestamps = []
    for url in pages:
        response = getContent(url)
        repls = (('januari', 'january'), ('februari', 'february'), ('mars', 'march'),
                 ('maj', 'may'), ('juni', 'june'), ('juli', 'july'),
                 ('augusti', 'august'), ('oktober', 'october'))
        response = reduce(lambda a, kv: a.replace(*kv), repls, response.lower())
        soup = BeautifulSoup(response)
        try:
            poststext = soup.select(".blogposttext")  # get posts text
            poststext = [nltk.clean_html(unicode(post)) for post in poststext]
            postsdatetime = soup.select(".blogpostheaderdate")
            postsdatetime = [nltk.clean_html(unicode(post)) for post in postsdatetime]
            postsdatetime = [parse(post, fuzzy=True) for post in postsdatetime]
            posts.extend(poststext[0:len(postsdatetime)])
            timestamps.extend(postsdatetime)
        except:
            pass
        # allText = allText + "\n\n" + getAllText(url)
    return posts, timestamps

def parse(self, fname):
    try:
        with open(fname, "r") as f:
            log.info("Process %s" % fname)
            soup = BeautifulSoup(f.read())
            tbl = soup.find("table", {"class": "cable"})
            docid = tbl.findAll('tr')[1].\
                findAll('td')[0].contents[1].contents[0]
            if docid in self.docids:
                return True
            doc = {
                "_id": docid,
                "reference_id": docid,
                "date_time": tbl.findAll('tr')[1].
                    findAll('td')[1].contents[1].contents[0],
                "classification": tbl.findAll('tr')[1].
                    findAll('td')[2].contents[1].contents[0],
                "origin": tbl.findAll('tr')[1].
                    findAll('td')[3].contents[1].contents[0],
                "header": nltk.clean_html(str(soup.findAll(['pre'])[0])),
                "body": nltk.clean_html(str(soup.findAll(['pre'])[1]))
            }
            return doc
    except OSError:
        log.error("Can't open '%s'" % fname)
        self.processed -= 1

def getKeyList(testID):
    myDataQ = getData(testID, 1)
    myDataA = getData(testID, 0)
    userKeyQ = getUserAnnotate(myDataQ)
    userKeyA = getUserAnnotate(myDataA)
    myCodeListQ = getCodeList(myDataQ)
    myCodeListA = getCodeList(myDataA)
    myHtml = getHTML(testID)
    t1 = []
    packQ = []
    funcQ = []
    for item in myCodeListQ:
        try:
            p, f = cparPack(nltk.clean_html(item))
            packQ += p
            funcQ += f
        except SyntaxError:
            pass
        t1 += preProCode(item)
    fQ, aQ, vQ, cQ = cparFuncs(t1)
    packQ, funcQ = cparPack(t1)
    fQ = list(set(fQ))
    aQ = list(set(aQ))
    vQ = list(set(vQ))
    cQ = list(set(cQ))
    combQ = []
    for cItem in cQ:
        for fItem in fQ:
            combQ.append(cItem + "." + fItem)
    t2 = []
    packA = []
    funcA = []
    for item in myCodeListA:
        try:
            p, f = cparPack(nltk.clean_html(item))
            packA += p
            funcA += f
        except SyntaxError:
            pass
        t2 += preProCode(item)
    fA, aA, vA, cA = cparFuncs(t2)
    fA = list(set(fA))
    aA = list(set(aA))
    vA = list(set(vA))
    cA = list(set(cA))
    combA = []
    for cItem in cA:
        for fItem in fA:
            combA.append(cItem + "." + fItem)
    keyList = list(set(fQ + fA + aQ + aA + vQ + vA + cQ + cA + combQ + combA +
                       packQ + packA + funcQ + funcA + userKeyQ + userKeyA))
    return keyList

def parse_file(self, filepath):
    """
    Parse a corpus file and initialize the object.

    @param filepath: The path of the corpus file to parse.
    @type filepath: C{string}
    """
    html_file = codecs.open(filepath, "r", "utf-8")
    raw_html = html_file.read()
    body = raw_html.split("<body>", 1)[1]
    raw_content = nltk.clean_html(body.split("</h1>", 1)[1])
    self.set_title(nltk.clean_html(body.split("</h1>", 1)[0]).strip() + ".")
    content = ""
    for p in raw_content.split("\n"):
        p = p.strip()
        if p != "":
            if content != "":
                content += " "
            content += p
    content = content.split("-", 1)[1].replace(u"\u202F", " ").strip()
    self.set_content(content)
    html_file.close()

def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    ptags = soup.find_all("p")
    text = nltk.clean_html("{0}".format(ptags[2]))
    return title, text

def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    storytag = soup.findAll('div', {'class': None})[1]
    text = nltk.clean_html("{0}".format(storytag))
    return title, text

def extrait(self, rss):
    d = feedparser.parse(rss)
    h = random.randint(0, len(d['entries']) - 1)
    print h
    print str(len(d['entries']))
    titre = nltk.clean_html(d['items'][h].title)
    descriptionb = nltk.clean_html(d['items'][h].description)
    description = re.sub("&#(\d+);", lambda m: chr(int(m.group(1))), descriptionb)
    return titre + ". \n\n" + description

def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Hotel review preprocess and truthfulness of the hotel review
    :param file_contents:
    :param file_contents_test:
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"
        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])
    truth_uni_prob_dict, truth_bi_prob_dict = process_prob(" ".join(truth_sentences))
    false_uni_prob_dict, false_bi_prob_dict = process_prob(" ".join(false_sentences))
    raw_test = clean_html(file_contents_test)
    raw_test = re.sub(r'IsTruthFul,review', "", raw_test)
    sentence_list_test = tokenize.line_tokenize(raw_test)
    test_list = []
    test_truth_false_list = []
    truth_count = false_count = i = 0
    for sentence in sentence_list_test:
        sent_arr = re.split(r',', sentence)
        truth_uni_perplex, truth_bi_perplex = perplexity(sent_arr[1], truth_uni_prob_dict, truth_bi_prob_dict)
        false_uni_perplex, false_bi_perplex = perplexity(sent_arr[1], false_uni_prob_dict, false_bi_prob_dict)
        test_list.append((sent_arr[1], truth_bi_perplex, false_bi_perplex))
        truth_or_false = 1 if truth_bi_perplex < false_bi_perplex else 0
        # truth_or_false = 1 if truth_uni_perplex < false_uni_perplex else 0
        if truth_or_false:
            truth_count += 1
        else:
            false_count += 1
        test_truth_false_list.append([i, truth_or_false])
        i += 1
    import csv
    with open("kaggle_sharp.csv", "wb") as f:
        writer = csv.writer(f)
        writer.writerows([['Id', 'Label']])
        writer.writerows(test_truth_false_list)
    print test_list
    print test_truth_false_list
    print truth_count
    print false_count

def __init__(self,directory): #get list of all tags that can be simplified into synonym tags stf = open(directory+"tags_synonym.csv", 'r') #converting each tag to its hypernym rdr= csv.reader(stf) for r in rdr: #r[0]=tag r[1]=tag it should be replaced with self.synonym_tags[r[0]]=r[1] stf.close() tf=open(directory+"tags.csv", 'r') #assign wieght for tag for each tag rdr=csv.reader(tf) for r in rdr: tmp=r[0].split(';') #tmp[0]=tag tmp[1]=frequency self.tags[tmp[0]]=float(1/float(tmp[1])) tf.close() for tmp in self.tags: t=tmp.split('-') if len(t)>1: t2=tmp.replace('-',' ') #print t2 if t[0] not in self.complex_tags: self.complex_tags[t[0]]=[] self.complex_tags[t[0]].append(t2) #self.complex_tags_replacements[t[0]]=tmp self.complex_tags_replacements[t2]=tmp qf=open(directory+"Questions&Answers&Tags.csv",'r') rdr=csv.reader(qf) for r in rdr: #r[0]:question title r[1]=question title r[2]: best answer r[3]: tags if r[0][len(r[0])-1] not in ['!','?','.']: r[0]=r[0]+'.' r[1]=nltk.clean_html(r[1]) r[2]=nltk.clean_html(r[2]) r[0]=r[0]+' '+r[1] self.questions.append(r[0]) self.answers.append(r[1]) n=len(self.questions)-1 r[3]=r[3].replace('<','') r[3]=r[3].replace('>',' ') tmplist=r[3].split(' ') for t in tmplist: if t in self.synonym_tags: r[3]=r[3].replace(t,self.synonym_tags[t]) tmplist=r[3].split(' ') tmplist.pop() self.tagsInQuestions[n]=tmplist for t in tmplist: if t not in self.questionsForTags: self.questionsForTags[t]=[] self.questionsForTags[t].append(n) qf.close()
def index():
    steps = Step.query.order_by(Step.num_de_paso)
    for step in steps:
        if step.tipo_de_tramite:
            step.tipo_de_tramite = clean_html(step.tipo_de_tramite)
        if step.requisitos:
            step.requisitos = clean_html(step.requisitos)
        if step.consideraciones:
            step.consideraciones = clean_html(step.consideraciones)
        if step.preguntas_frecuentes:
            step.preguntas_frecuentes = clean_html(step.preguntas_frecuentes)
    return render_template('index.html', steps=steps)

def autos_us():
    html = open('autos-us.html').read()
    soup = BeautifulSoup(html)
    first = soup.find('li').contents[0]
    second = first.parent.next_sibling.next_sibling.contents[0]
    third = second.parent.next_sibling.next_sibling.contents[0]
    majors = [first, second, third]
    minors = soup.select('ul li ul li')
    major_tokens = [nltk.clean_html(str(w)) for w in majors]
    minor_tokens = [nltk.clean_html(str(w)) for w in minors]
    minor_tokens = [re.sub(r'\s\([\S\s]+\)|\[\s\S\s\]|\n\s[A-Za-z]+', r'', token) for token in minor_tokens]
    tokens = list(set(major_tokens + minor_tokens))
    return tokens

def gasPrices(origin, destination):
    one_way_cost = ''
    from_address = origin
    to_address = destination
    new_from_address = from_address.replace(" ", "+")
    new_to_address = to_address.replace(" ", "+")
    url = "http://www.travelmath.com/cost-of-driving/from/" + new_from_address + "/to/" + new_to_address
    html = urllib.urlopen(url)
    for line in html:
        if "costofdriving" in line and "$" in line:
            one_way_cost = nltk.clean_html(line.split("one-way")[0].replace("$", ""))
            round_trip_cost = nltk.clean_html(line.split("one-way")[1].replace("round trip", "").replace("$", "")).replace('/ ', "")
            break
    return one_way_cost

def invent_ext(htmlString):
    start = htmlString.find("Inventors:")
    end = htmlString.find("Assignee:")
    end2 = htmlString.find("Appl. No.:")
    if start == -1:
        extract = "No Inventors Listed"
    else:
        if end == -1:
            extract = htmlString[start+11:end2]
            extract = nltk.clean_html(extract)
        else:
            extract = htmlString[start+11:end]
            extract = nltk.clean_html(extract)
    return extract

def webUrl(fullUrl):
    # urllib2 works best with a specific url format
    validUrl = re.compile(
        r'^(?:http)s?://|'  # http:// or https://
        r'^(?:http)s?://www.'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    if validUrl.match(fullUrl):
        finalList = []
        urlInput = quote(fullUrl, safe="%/:=&?~#+!$,;'@()*[]")
        urlInput = urlInput.strip('%0A')
        try:
            u = urlopen(urlInput)
            html = u.read()
            raw = nltk.clean_html(html)
            tokens = nltk.word_tokenize(raw)
            if args.minLength or args.maxLength:
                for token in tokens:
                    if not (len(token.translate(None, charBlacklist)) < minl or len(token) > maxl):
                        wordList.append(str(token).translate(None, charBlacklist))
            else:
                for token in tokens:
                    wordList.append(str(token).translate(None, charBlacklist))
            print "Scraping URL - {0}".format(fullUrl)
        except Exception as e:
            print 'There was an error connecting to or parsing {0}'.format(fullUrl)
            print 'Error: %s' % e
    else:
        print 'INVALID URL - {0}. Format must be http(s)://www.smeegesec.com.'.format(fullUrl)

def Create_index_from_url(url, depth):
    if depth > MAX_DEPTH:
        return []
    url_queue = Queue()
    url_queue.put(url)
    checked = []
    IndexGen = Index_Generator()
    while not url_queue.empty():
        current_url = url_queue.get()
        checked.append(current_url)
        try:
            html = Get_page(current_url)
        except:
            print "Exception"
            continue
        if depth > 0:
            for link in Link_generator(html):
                # print link
                if link not in checked:
                    url_queue.put(link)
            depth = depth - 1
        html = nltk.clean_html(html)
        IndexGen.gen_url_index(current_url, html)
    result_index = {}
    result_index = IndexGen.get_index_dict()
    for key in result_index:
        result_index[key].sort()
    return result_index

def counts_pages_words(url):
    source = requests.get(url)
    clean = nltk.clean_html(source.text)
    tokens = nltk.word_tokenize(clean)
    # can make a nltk.text.Text object out of tokens
    tokens = [word for word in tokens if word.lower() not in sw]
    freqdist = nltk.FreqDist(tokens)
    return {
        "title": source.url.title,
        "freq_dist": freqdist.items()
    }

def retrieve_editorial(self, a_url):
    editorial = []
    # Open URL object
    print a_url, " < url"
    try:
        contents = self.url_read(a_url)
        para_ct = 0
        for para in re.finditer(r'<p>(.*?)</p>', contents, re.DOTALL):
            try:
                para = para.groups()[0]
                if dbg:
                    print "para ", len(para)
                para_ct += len(para)
                cleaned = nltk.clean_html(para)
                self.toks = cleaned.split()
                # self.toks = nltk.word_tokenize(cleaned)
                self.toks = [it.lower() for it in self.toks]
                self.remove_punctuation()
                if dbg:
                    print(self.toks)
                editorial.extend(self.toks)
            except Exception, e:
                print para
                print e
        print para_ct, 'symbols'

def test3():
    import nltk
    from nltk.corpus import conll2000
    from urllib import urlopen
    fname = 'data/dummy/webpages/Abby_Watkins/raw/002/index.html'
    doc = urlopen(fname).read()
    raw = nltk.clean_html(doc)
    decoded = raw.decode('utf-8', errors='ignore')
    raw = decoded.encode('utf-8')
    print raw
    sentences = nltk.sent_tokenize(raw)
    sentences = [s.replace('\n', '').replace('\r', '').strip() for s in sentences]
    sentences = [nltk.word_tokenize(s) for s in sentences]
    sentences = [nltk.pos_tag(s) for s in sentences]
    # porter = nltk.PorterStemmer()
    # sentences = [[(porter.stem(w[0]), w[1]) for w in s] for s in sentences]
    # sentences = [[w[0] for w in s] for s in sentences]
    # sentences = [['%s_%s' % w for w in s] for s in sentences]
    lexicon = []
    # for s in sentences:
    #     print len(s)
    #     for w in s:
    #         print w[0]
    #     print ' '.join(w[0] for w in s)
    #     print nltk.ne_chunk(s, binary=True)
    #     lexicon.extend(s)
    fdist = nltk.FreqDist(lexicon)

def get_xmen_text(soup):
    # en_stopwords = set(nltk.corpus.stopwords.words('english'))
    raw = nltk.clean_html(str(soup))
    raw_trunc = raw[:raw.rfind('References')]
    sents = nltk.sent_tokenize(raw_trunc)
    words = [nltk.word_tokenize(sent) for sent in sents]
    poss = [nltk.pos_tag(word) for word in words]
    # nes = [nltk.ne_chunk(pos, binary=True) for pos in poss]
    # for pos in poss: print pos
    poss_filter = [filter_insignificant(pos, tag_suffixes=['DT']) for pos in poss]
    print poss_filter
    nes = [nltk.ne_chunk(pos, binary=True) for pos in poss_filter]

    def sub_leaves(tree, node):
        return [t.leaves() for t in tree.subtrees(lambda s: s.node == node)]

    people = [sub_leaves(ne, 'NE') for ne in nes]
    people = [item for sublist in people for subsublist in sublist
              for subsubsublist in subsublist for item in subsubsublist
              if item not in ('NNP', 'NN', 'NNPS', 'JJ')]
    people = merge_people(people)
    fd = nltk.FreqDist(person for person in people if person != 'Magneto')
    fd.plot(50)

def fetchWebPAge(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, timeout=1)
        content = response.read()
        urlDownloaded.append(url)
        print url
        soup = BeautifulSoup(content)
        raw = nltk.clean_html(content)
        tokenized = nltk.word_tokenize(raw)
        priority = get_priority(url, content, tokenized)
        br = mechanize.Browser()
        br.open(url)
        for link in br.links():
            new_url = urlparse.urljoin(link.base_url, link.url)
            base = urlparse.urlparse(new_url).hostname
            path = urlparse.urlparse(new_url).path
            finalUrl = "http://" + base + path
            if not pat.search(finalUrl):
                if finalUrl not in urlVisited:
                    urlQueue.put(finalUrl)
                    urlVisited.append(finalUrl)
    except socket.timeout, e:
        raise MyException("[TIMEOUT ERROR]:: %r" % e)

def obtenerNoticias():
    # Result to return
    todas_noticias = []
    # Process the sources
    for fuente in fuentes:
        resultado = Noticias()
        todas_noticias.append(resultado)
        # Download the feed
        noticias = feedparser.parse(fuente)
        titulo_fuente = noticias['feed']['title']
        # Iterate over the entries
        for noticia in noticias['entries']:
            noticia_titulo = noticia['title'].encode('UTF-8', 'replace')
            noticia_resumen = nltk.clean_html(noticia['summary']).encode('UTF-8', 'replace')
            noticia_enlace = noticia['link'].encode('UTF-8', 'replace')
            resultado.insertar(noticia_titulo, noticia_resumen, noticia_enlace)
    resultado = Noticias()
    longitud = 0
    for fuente in todas_noticias:
        longitud = max(longitud, fuente.longitud())
    for i in range(longitud):
        for fuente in todas_noticias:
            if i >= fuente.longitud():
                continue
            resultado.insertar(fuente.obtener(i)[0], fuente.obtener(i)[1], fuente.obtener(i)[2])
    return resultado

def format_text_for_NER(raw_text, social_web_platform=None):
    """
    Prepares the given text for named entity extraction. Minimal processing is
    performed in order to remove line breaks, links, etc., rather than more
    substantial formatting like porting or stemming that would interfere with a
    NER toolkit's ability to recognize entities.
    """
    # remove line breaks
    cleaned_text = raw_text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
    # remove html
    cleaned_text = nltk.clean_html(cleaned_text)
    # remove links (www.* or http*)
    cleaned_text = re.sub('((www\.[\s]+)|(https?://[^\s]+))', '', cleaned_text)
    # replace double quotes with single quotes to avoid a Wikipedia Miner error
    cleaned_text = cleaned_text.replace("\"", "\'")
    # remove non-printable characters
    cleaned_text = filter(lambda x: x in string.printable, cleaned_text)
    # clean any social web platform specific text
    if social_web_platform is not None:
        cleaned_text = social_web_platform.clean_text(cleaned_text)
    # remove misc. remnant strings we don't care about
    words_manually_filter = []
    cleaned_text = ' '.join([word for word in cleaned_text.split() if word not in words_manually_filter])
    return cleaned_text

def parse_detail(self, response):
    item = TbsItem()
    headers = response.headers
    self.set_items_value(item, 'character', self.get_page_character(response.body))
    self.set_items_value(item, 'crawl_stats', self.default_crawl_stats)
    self.set_items_value(item, 'searchkeywords', self.keyword)
    self.set_items_value(item, 'spiderid', self.name)
    self.set_items_value(item, 'refer', response.meta['refer'])
    self.set_items_value(item, 'url_hash_no_fragment', self.get_url_hash_no_fragment(response.url))
    self.set_items_value(item, 'url', self.parseurl(response.url))
    self.set_items_value(item, 'root_domain', urlparse(response.url).hostname)
    self.set_items_value(item, 'Expires',
                         self.to_GMT_timestamp(headers['Expires']) if 'Expires' in headers.keys()
                         else self.to_GMT_timestamp(None))
    self.set_items_value(item, 'LastModified',
                         self.to_GMT_timestamp(headers['Last-Modified']) if 'Last-Modified' in headers.keys()
                         else self.to_GMT_timestamp(None))
    try:
        hxs = HtmlXPathSelector(response)
        self.set_items_value(item, 'title', ','.join(hxs.select('//title/text()').extract()))
        self.set_items_value(item, 'desc', ','.join(hxs.select('//meta[@name="description"]/@content').extract()))
        self.set_items_value(item, 'keyword', ','.join(hxs.select('//meta[@name="keywords"]/@content').extract()))
    except:
        self.set_items_value(item, 'title', ' ')
        self.set_items_value(item, 'desc', ' ')
        self.set_items_value(item, 'keyword', ' ')
    self.set_items_value(item, 'body', response.body)
    self.set_items_value(item, 'stripedbody', nltk.clean_html(self.strip_body(response.body)))
    return item

def create_feedset(feed_seq):
    '''
    Call on Google Reader with subscription request and create a set of
    (title, link) pairs: a Feed Set
    '''
    import nltk
    # import pickle
    # f = open('/home/crc/tmp/apollo.pkl', 'rb')
    # feedset = pickle.load(f)
    # f.close()
    pat = re.compile('http://.*$')
    feedset = []
    for eachfeed in feed_seq:
        feed_str = eachfeed.id()
        result = pat.search(feed_str)
        if result is not None:
            logging.info("Refreshing %d from '%s'...", eachfeed.unread_count(), result.group())
            eachfeed.refresh()
            logging.info("Parsing...")
            pipe_feed = eachfeed.parse()
            for entry in pipe_feed.entries:
                title = nltk.clean_html(entry.title)
                # actually want 'id' here in order to Edit
                feedset.append((title, entry.id))
    logging.info("Done")
    return feedset

def scrapePage(url):
    # Extract page text from a web URL (ignoring navigation links, ads, etc.).
    try:
        print "URL: " + url
        # url = url.replace('(', '%28')
        # url = url.replace(')', '%29')
        # print "New URL:" + url
        result = alchemyObj.URLGetText(url)
        soup = BeautifulSoup(result)
        raw = soup('text')
        raw = [text.text for text in raw]
        rawstr = ' '.join(raw)
    except Exception:
        try:
            print "\n\nscraping using regex"
            webpage = urllib2.urlopen(url).read()
            # webpage = str(webpage)
            para = re.compile('<p>(.*)</p>')  # collect data in p tags and store in para object
            raw = re.findall(para, webpage)
            rawstr = ' '.join(raw)
            clean_raw = nltk.clean_html(rawstr)
            rawstr = clean_raw
        except Exception:
            rawstr = "Web page could not be scraped..."
    print rawstr
    return rawstr

def fcount(url):
    import urllib2
    import nltk
    import re
    rcount = 0
    try:
        rpage = urllib2.urlopen(url).read()
    except:
        return rcount
    tbegin = rpage.find("user-rating")
    tend = rpage.find("review-list")
    temp = rpage[tbegin:tend]
    tbegin = temp.find(" Write a Review")
    tend = temp.find("review-list")
    temp = temp[tbegin:tend]
    tbegin = temp.find("of")
    tend = temp.find("review-list")
    temp = temp[tbegin:tend]
    tbegin = temp.find("of")
    tend = temp.find("review-list")
    temp = temp[tbegin:tend]
    tbegin = temp.find("<strong")
    tend = temp.find("</strong>")
    temp = temp[tbegin:tend]
    temp = nltk.clean_html(temp)
    rcount = temp
    rcount = re.sub(r'[^0-9]', '', rcount)
    return int(rcount)

def extractchunk(tweettuple):
    sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(w))) for (a, w) in tweettuple]
    cid = [str(a) for (a, w) in tweettuple]
    tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
    return dict(zip(cid, ne_chunks))

tokens = nltk.word_tokenize(raw)
type(tokens)
len(tokens)
tokens[80:110]
text = nltk.Text(tokens)
text.collocations()

# Online articles
url = "http://www.bbc.co.uk/news/science-environment-21471908"
# Getting text out of HTML is a sufficiently common task that NLTK provides a helper
# function nltk.clean_html(), which takes an HTML string and returns raw text.
html = urlopen(url).read()
html[:60]
raw = nltk.clean_html(html)
tokens = nltk.word_tokenize(raw)
tokens[:15]

# Processing RSS Feeds:
import feedparser

# Reading local files
f = open('C:\Data\Files\UK_natl_2010_en_Lab.txt')
raw = f.read()
print raw[:100]

# User input
s = raw_input("Enter some text: ")

# Regular expressions applications. Find and count all vowels.

# else:
#     print "skipped twitter"
# searchset.append('next')  # for testing a new way out.
# print " List of Search Result Pages = %s" % searchset
# exit()
row_data1 = []
# Creating content array of all the pages returned from Google
for testurls in searchset:
    filename1 = myopener.open(testurls).read()
    readable_data1 = Document(filename1).summary()
    # Removing the HTML tags from the web page for processing
    tempval = nltk.clean_html(readable_data1)
    row_data1.append(tempval)
# print datetime.datetime.now() - t0
# print row_data1
result = []
temp_result = [_getAnswer("", row_data, node) for row_data in row_data1]
# print "Result Set = %s " % result
word_freq = {}
# Count the frequency of occurrences of results from all pages
for word in result:
    word_freq[word] = word_freq.get(word, 0) + 1

from __future__ import division
import nltk, re, pprint

# 3.1 Accessing electronic books on the web
from urllib.request import urlopen
url = "http://www.gutenberg.org/files/2554/2554.txt"
raw = urlopen(url).read()
print(len(raw))

# Read a web file through a specified proxy
proxies = {'http': 'http://www.someproxy.com:3128'}
raw = urlopen(url, proxies=proxies).read()

# Tokenization (NLTK's built-in regex tokenizer)
tokens = nltk.word_tokenize(raw)

# Produce the familiar structure: a list of words and punctuation
text = nltk.Text(tokens)

# Find the start and end positions of the text
raw.find("PART I")
raw.rfind("End of Project Gutenberg's Crime")

# 3.2 Working with HTML
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
raw = nltk.clean_html(html)  # returns the raw text
tokens = nltk.word_tokenize(raw)
# Find the start and end to get the text we need

# 2 Processing search engine results
# 3.3 Stemmers (built-in / custom regex)
# 3.4 Lemmatization

def create_features(X, user_data=None): res = [] for date, comment, user in X: feat = {} has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0 has_fwenzel_word = 0 has_swastika = swastika in comment comment = comment.lower() comment = parse_text(comment) comment = nltk.clean_html(comment) sents = sent_tokenize(comment) doc = [] for sent in sents: # Tokenize each sentence. doc += wordtokenizer.tokenize(sent) def repl_filter(x): return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"] # Remove stopwords and replacement tokens. doc = filter(repl_filter, doc) for i, word in enumerate(doc): if doc[i] in bad_words: doc[i] = '_badword_' doc[i] = ps.stem(doc[i]) doc[i] = wnl.lemmatize(doc[i]) if doc[i] in bad_words: doc[i] = '_badword_' if doc[i] in hate_words: has_hate_word = 1 if doc[i] in drug_words: has_drug_word = 1 if doc[i] in cult_words: has_cult_word = 1 if doc[i] in occult_words: has_occult_word = 1 if doc[i] in porn_words: has_porn_word = 1 if doc[i] in fwenzel_words: has_fwenzel_word = 1 bigram_finder = BigramCollocationFinder.from_words(doc) bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5) bigram = dict([(ngram, True) for ngram in itertools.chain(doc, bigrams)]) feat.update(bigram) text_vocab = set(w for w in doc if w.isalpha()) unusual = text_vocab.difference(english_vocab) unusual_ratio = len(unusual) / len(text_vocab) if len( text_vocab) != 0 else -1.0 unusual2 = unusual.difference(set("_badword_")) unusual_ratio2 = len(unusual2) / len(text_vocab) if len( text_vocab) != 0 else -1.0 if user_data is not None: user_info = user_data[user] has_bad_word = True for word in bad_words: if word in comment.lower(): break else: has_bad_word = False def n_none(x): return int(x) if x is not None else 0 def c_none(x): return x if x is not None else "__None__" readability = ReadabilityTool(comment) read_feat = {} for f, val in readability.analyzedVars.items(): if f != 'words': read_feat["_" + f] = val for test, val in readability.tests_given_lang['eng'].items(): read_feat["__" + test] = val(readability.text) feat['_always_present'] = True feat['_word_num'] = len(doc) feat['_sent_num'] = len(sents) feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0 feat['_sent_var'] = len(set(sents)) / len(sents) feat['_unusual_ratio'] = unusual_ratio feat['_unusual_ratio2'] = unusual_ratio2 if user_data is not None: feat['_username'] = user feat['_user_subcount'] = int(user_info['SubscriberCount']) feat['_user_friends'] = int(user_info['FriendsAdded']) feat['_user_favs'] = int(user_info['VideosFavourited']) feat['_user_videorates'] = int(user_info['VideosRated']) feat['_user_videouploads'] = int(user_info['VideosUploaded']) feat['_user_videocomments'] = int(user_info['VideosCommented']) feat['_user_videoshares'] = int(user_info['VideosShared']) feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded']) feat['_user_gender'] = c_none(user_info['Gender']) feat['_user_age'] = n_none(user_info['Age']) feat['_user_closed'] = user_info['UserAccountClosed'] feat['_user_suspended'] = user_info['UserAccountSuspended'] feat['_user_has_gender'] = 1 if user_info[ 'Gender'] is not None else 0 feat['_user_has_school'] = 1 if user_info[ 'School'] is not None else 0 feat[ '_user_has_books'] = 1 if user_info['Books'] is not None else 0 feat['_user_has_movies'] = 1 if user_info[ 'Movies'] is not None else 0 feat[ '_user_has_music'] = 1 if user_info['Music'] is not None else 0 feat['_user_has_location'] = 1 if user_info[ 'Location'] is not None else 0 
feat['_user_has_hometown'] = 1 if user_info[ 'Hometown'] is not None else 0 # feat['_user_last'] = user_info['LastWebAccess'] # Dictionary features feat['_has_bad_word'] = has_bad_word # feat['_has_hate_word'] = has_hate_word # feat['_has_drug_word'] = has_drug_word feat['_has_cult_word'] = has_cult_word feat['_has_swastika'] = has_swastika # feat['_has_occult_word'] = has_occult_word # feat['_has_has_fwenzel_word'] = has_fwenzel_word feat['_has_porn_word'] = has_porn_word feat['_has_swastika'] = has_swastika feat.update(read_feat) # print feat res.append(feat) return res
import nltk
from nltk import word_tokenize
from urllib import request

# change the path to where the nltk data is being stored
nltk.data.path.append('/Users/zhi/Documents/Programming/PROJECTS_Python/data')

# html
html = urlopen(url).read()   # download web page
raw = nltk.clean_html(html)  # strip remaining html
raw = raw[750:23506]         # trim to desired content

# ascii
tokens = nltk.wordpunct_tokenize(raw)  # tokenize the text
tokens = tokens[20:1834]               # select tokens of interest
text = nltk.Text(tokens)               # create nltk text

# vocab
words = [w.lower() for w in text]  # normalize the words
vocab = sorted(set(words))         # build the vocabulary

url = 'http://www.cnn.com/2014/07/19/world/europe/ukraine-malaysia-airlines-crash/'

## fetch html
import requests
r = requests.get(url)
html = r.content

## nltk: fetch text by cleaning html
import nltk
text = nltk.clean_html(html)

## fetch text based on density: useful text
import usefulText as u
text = u.extract_text(html)

## unicode
text = text.decode('utf-8', 'ignore')

## segment into sentences
import sys
sys.path.append('../version0.0/')
import segment_sentence as ss

def isProper(sentence):
    if len(sentence) <= 5:
        return False
    if '|' in sentence:
        return False

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import json
import pandas as pd
import re
import nltk
import jieba

data = json.loads(open('../data/cookbook.json', 'r').read())
df = pd.DataFrame(data)

steps = []
for _s in df['steps']:
    step = []
    for s in _s:
        s = re.sub(r'\r\n', '\n', s)
        s = re.sub(r'\t', ' ', s)
        s = nltk.clean_html(re.sub(r' +', ' ', s)) + '\n'
        step.append(s)
    steps.append(''.join(step))

all_steps = ''.join(steps)
seg_list = jieba.cut(all_steps)
fdist = nltk.FreqDist(seg_list)
for m in fdist:
    print '%s : %s' % (m, fdist[m])

train_header = train_file.next()
test_file = csv.reader(open("Test.csv", "rb"))
test_header = test_file.next()

result_file = open("Result.csv", "w")
result_file.write('"Id","Tags"\n')

traindata = []
testdata = []
docs = []

print "Train Start"
i = 0
for data in train_file:
    tokens = re.split(r"\W+", nltk.clean_html(data[2]))
    # tokens = nltk.word_tokenize(nltk.clean_html(data[2]))
    docs.append(tokens)
    i += 1
    if i > 100000:
        break

print "Make collection start"
# Make the collection for calculating TF-IDF
collection = nltk.TextCollection(docs)

print "Testing data start"
for data in test_file:
    title_tokens = nltk.word_tokenize(data[1])
    tokens = re.split(r"\W+", nltk.clean_html(data[2]))

def compute_score(queryphrase="", keywords=[], answers=[], urls=[], scorevalue=0, rangevalue=0, left=False, right=False): urls = getGoogleLinks(queryphrase, 3) if keywords == []: keywords = getKeywords(queryphrase) print(keywords) keyword = True combinedtokens = [] for url in urls: html = urlopen(url).read() raw = nltk.clean_html(html) combinedtokens += nltk.word_tokenize(raw) combinedtokens = [t for t in combinedtokens if len(t) > 2 and t.lower() not in ignored_words] querytokens = nltk.word_tokenize(queryphrase) # only supports two keywords if keyword == True: instances = {} tokenrange = findrange(len(combinedtokens)) for word in keywords: for i in tokenrange: if combinedtokens[i] == word: if word not in instances.keys(): instances[word] = [i] else: instances[word].append(i) combinedinstances = [] # right now only two keywords are supported if len(keywords) != 1 and len(keywords) != 2: print "error, number of keywords must be one or two" return 4 if len(keywords) == 2: for instanceone in instances[keywords[0]]: for instancetwo in instances[keywords[1]]: if (instancetwo - instanceone) < 20 and (instanceone, instancetwo) not in combinedinstances: combinedinstances.append((instanceone, instancetwo)) elif (instanceone - instancetwo) < 20 and (instancetwo, instanceone) not in combinedinstances: combinedinstances.append((instancetwo, instanceone)) # print(combinedinstances) relevanttokens = [] if len(keywords) == 1: for instance in instances[keywords[0]]: relevanttokens += combinedtokens[instance - rangevalue : instance + rangevalue] else: for leftinstance, rightinstance in combinedinstances: relevanttokens += combinedtokens[leftinstance - rangevalue : rightinstance + rangevalue] # print(relevanttokens) relevanttokenrange = findrange(len(relevanttokens)) scores = {} for answer in answers: answertokens = nltk.word_tokenize(answer) length = len(answertokens) for i in relevanttokenrange: if relevanttokens[i : (i + length)] == answertokens: if answer not in scores.keys(): scores[answer] = scorevalue else: scores[answer] += scorevalue print scores return scores
def element_14_snippet_getter(url, keywords=GLOBAL_KEYWORDS): """ Given a URL and keywords, returns a list of text snippets including those keywords. Inputs: 1. url: str: the desired URL 2. keywords: list of str's: defaults to global_keywords defined in main(); the desired keywords Output: 1. snippets: list of unicode's: the desired text snippets """ # Pulls clean text from the URL, devoid of HTML. try: html = urlopen(url).read() except UnicodeError: print "UnicodeError thrown. Skipped offending URL (probably not in English and thus unanalyzable)." return [] soup = BS(html) raw = nltk.clean_html(html) # We don't want propaganda written by Freescale employees. if ("FreescaleTools_and_Software" in raw) or ("GregC" in raw) or ("MAb" in raw): return [] # Finds all HTML subtrees that tell us the date of writing of each post. post_data = soup.find_all(class_="j-post-author") # Finds all HTML subtrees comprising the posts themselves. posts = soup.find_all(class_="jive-rendered-content") snippets = [] tokenizer = ST() dates = [] # Assembles the dates of writing of each post. for post_datum in post_data: date_posted = date_getter(post_datum) dates.append(date_posted) # For each post in the page, grabs the text and metadata. for i in range(len(posts)): text = posts[i].get_text() # Splits the text into its individual sentences so that we can pick the ones we like. intermediate_snippets = tokenizer.tokenize(text) # Grabs text containing keywords, as well as sentences preceding and following those with keywords. for j in range(len(intermediate_snippets)): for word in keywords: snippet = intermediate_snippets[j] if (word in snippet.lower()) and ("http" not in snippet.lower()): offset = 1 for k in range(j, len(intermediate_snippets)): if word not in intermediate_snippets[k].lower(): break else: offset += 1 if j == 0: subsnippet = intermediate_snippets[j:offset] else: subsnippet = intermediate_snippets[j-1:j+offset] # Puts the individual sentences back together. tokens = [] for sentence in subsnippet: stripped = sentence.lstrip().rstrip().encode("UTF-8") tokens.append(stripped) for token in tokens: try: if len(token) < 1500: snippets.append([token, url, dates[i][0], dates[i][1], dates[i][2]]) except IndexError: continue return snippets
def removeHTML(data):
    """Clean up html"""
    return [nltk.clean_html(dat) for dat in data]

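# Note: nltk.clean_html() was removed in NLTK 3.0; calling it now raises
# NotImplementedError and the project recommends using an HTML parser instead.
# A minimal stand-in for the snippets in this collection, assuming BeautifulSoup 4
# is installed:
from bs4 import BeautifulSoup

def clean_html(html):
    # Parse the markup and return only the visible text, roughly matching the
    # behaviour of the old nltk.clean_html() helper.
    return BeautifulSoup(html, "html.parser").get_text(separator=" ").strip()
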
def clean_and_tag_all(): """ Create new CSV containing tagged versions of all sentences """ # set filepath to input basepath = os.path.dirname(__file__) file_in = os.path.abspath( os.path.join(basepath, '..', 'reuters/csv/single_records.csv')) file_out = os.path.abspath( os.path.join(basepath, '..', 'reuters/csv/sentences_POS.csv')) sentence_splitter = set_up_tokenizer() chunker = chunking.set_up_chunker() stemmer = nltk.SnowballStemmer('english') with open(file_in, 'rb') as csv_in: with open(file_out, 'wb') as csv_out: csv_reader = csv.DictReader( csv_in, ['SOURCE_ID', 'DRUGS', 'COMPANIES', 'SENTENCE'], delimiter=',') csv_writer = csv.DictWriter(csv_out, [ 'SOURCE_ID', 'SENT_NUM', 'SENTENCE', 'NO_PUNCT', 'DRUGS', 'COMPANIES', 'POS_TAGS', 'CHUNKS' ], delimiter=',') csv_writer.writeheader() #csv_reader.next() for row in csv_reader: # display progress bar sys.stdout.write('.') sys.stdout.flush() # clean up html tags # named SENTENCE in the reader so it works nicely when writing row plaintext = nltk.clean_html(row['SENTENCE']) # this in particular seems to be screwing up some of the sentence splitting plaintext = plaintext.replace('Inc .', 'Inc.') # split into sentences sentences = sentence_splitter.tokenize(plaintext) if len(sentences) > 0: for i, s in enumerate(sentences): # TODO integrate stanford NER recognition output into this # clean up sentence s, no_punct = remove_punctuation(s) # CHUNKING - need to include punctuation for this to be anywhere near accurate tokens = nltk.pos_tag(nltk.word_tokenize(s)) chunks = chunker.parse(tokens) # POS TAGS - don't want to include punctuation tokens = nltk.word_tokenize(no_punct) # put the hyphens back after tokenisation # underscores mean that the tokens are better recognised when tagging no_punct = no_punct.replace('_', '-') s = s.replace('_', '-') tags = nltk.pos_tag(tokens) # STEMMING - add stemmed version of word to end of each tagged token tags = [(token, tag, stemmer.stem(token.lower())) for (token, tag) in tags] # TODO parse tree info, chunking, something to do with stemming? # ignore any rogue bits of punctuation etc if len(tags) > 1: # write row to file for each sentence new_fields = { 'SENT_NUM': i, 'SENTENCE': s, 'NO_PUNCT': no_punct, 'POS_TAGS': tags, 'CHUNKS': chunks } row.update(new_fields) csv_writer.writerow(row) print 'Written to sentences_POS.csv'
from collections import Counter as C
import urllib2
import nltk

urls = []
with open('manisourcehtml1.html', 'r+') as f:
    urls = map(
        lambda x: "http://en.wikipedia.org" + x[13:x.index('" title')],
        filter(lambda x: x.startswith('<td><a href="/wiki/') and '(' not in x,
               f.readlines()))
print urls

import pdb
pdb.set_trace()

c = C()
with open('manibackup1.txt', 'w+') as f:
    for url in urls:
        raw = nltk.clean_html(urllib2.urlopen(url).read())
        if '^' in raw:
            raw = raw[:raw.index('^')]
        raws = raw.split()
        print url
        c.update(C(filter(lambda x: all(map(str.isalpha, x)) and len(x) > 3,
                          map(lambda x: str.lower(x), raws))))
    f.write(str(c))

        bad.append(pl)
        continue
    j = j + 1
    if j % 10 == 0:
        pass  # print j
    tl = entry.get('tag_list')
    grams = set([x.strip() for x in tl.replace('-', ' ').split(',')]) if tl else set()
    txt = entry.get('overview')
    # clean tags and text
    # 1. strip eol, apostrophes, numbers, HTML
    # 2. all other punctuation to spaces
    # 3. Break into sentences
    if txt:
        txt2 = nltk.clean_html(txt.replace("\n", " ").encode('ascii', 'ignore').replace('\\/', '/').replace("'", ""))
        txt3 = ptn5.sub(" ", ptn4.sub(".", ptn3.sub(" ", ptn2.sub("", txt2))))
        sents = ptn6.split(txt3)
        # tokenize sentences
        for sent in sents:
            sent1 = ptn.sub("", sent.lower().replace(".", " "))
            sent2 = sent1.split()
            grams.update(set(nltk.bigrams(sent2)))
            grams.update(set(nltk.trigrams(sent2)))
    # gramcnt = {}
    # for gram in grams: gramcnt[gram] = gramcnt.get(gram, 0) + 1
    # save (pl, {gram:x, gram:y, gram:z, ...})
    cograms.append((pl, list(grams)))

                  help='output file')
(options, args) = parser.parse_args()

SAMPLE_URLS = ['http://www.henryklahola.nazory.cz/Vira.htm',
               'http://www.henryklahola.nazory.cz/Snatek.htm', ] \
    if not options.sample else options.sample.split(' ')
WORDS = 500 if not options.words else int(options.words)
NGRAM = 3 if not options.bigrams else 2

samples = []
if options.sample:
    for url in SAMPLE_URLS:
        sample = unicode(
            BeautifulSoup(urlopen(url), convertEntities=BeautifulSoup.HTML_ENTITIES))
        samples.append(nltk.clean_html(sample))
elif options.input:
    samples = [open(options.input).read().decode('utf8')]

tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenized = tokenizer.tokenize(' '.join(samples))

warnings.simplefilter("ignore")
model = nltk.NgramModel(NGRAM, tokenized)
starts = model.generate(100)[-2:]
generated = model.generate(WORDS, starts)
out = ' '.join(generated).encode('utf8').replace(' , ', ', ').replace(' . ', '. ')
out = '%s%s...' % (out[0].upper(), out[1:])
if options.output:

def get_first(url, count):
    raw = nltk.clean_html(urllib.urlopen(url).read())
    return (raw[:count], raw[count:])

def get_text(html):
    soup = BeautifulSoup(html)
    text = soup.find('div', id='article_body')
    # print text
    text = nltk.clean_html(str(text))
    return text

def clean_up(text):
    return nltk.clean_html(xml.sax.saxutils.unescape(text))

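# A small usage sketch with a hypothetical input: entities are unescaped first and the
# tags are stripped afterwards, so escaped markup comes back as plain text.
print clean_up('&lt;p&gt;Fish &amp; chips&lt;/p&gt;')  # -> "Fish & chips"
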
from urllib import urlopen
import csv
from nltk.corpus import stopwords

url = "http://hbr.org/2013/04/now-is-our-time/ar/1"  # keeping this along with other urls
proxies = {'http': 'http://*****:*****@10.1.9.23:8080'}
raw = urlopen(url, proxies=proxies).read()
cleanraw = nltk.clean_html(raw)

# raw.txt contains the raw text
f = open('raw.txt', 'w')
f.write(cleanraw)
f.close()

# tokenize cleanraw
tok_clean = nltk.word_tokenize(cleanraw)

# removing all the smartquotes
clean = []

import requests
import json
import nltk

url = "http://*****:*****@localhost:7474/db/data/cypher"
url1 = "http://*****:*****@localhost:7474/db/data/node/"

payload = {}
for i in resp:
    elastic = url1 + str(i[0])
    payload['link'] = uri + str(i[0])
    payload['title'] = i[1]
    payload['content'] = nltk.clean_html(i[2])
    # print url1
    print elastic
    print payload
    print requests.put(elastic, data=json.dumps(payload)).json()

def clean_and_tag(): """ Create new CSV containing all relevant sentences """ # set filepath to input basepath = os.path.dirname(__file__) file_in = 'data/reuters/press_releases/PR_drug_company_500.csv' file_in = os.path.abspath(os.path.join(basepath, '..', '..', file_in)) file_out = os.path.abspath( os.path.join(basepath, '..', 'reuters/sentences_POS.csv')) # set up sentence splitter with custom parameters punkt_params = punkt.PunktParameters() # sentences are not split ending on the given parameters, using {} creates a set literal punkt_params.abbrev_types = { 'inc', 'inc ', '.tm', 'tm', 'no', 'i.v', 'drs', 'u.s' } # the tokenizer has to be unpickled so better do it once here than every time it is used sentence_splitter = punkt.PunktSentenceTokenizer(punkt_params) with open(file_in, 'rb') as csv_in: with open(file_out, 'wb') as csv_out: # TO DO use dictionary reader to avoid using magic numbers for columns csv_reader = csv.reader(csv_in, delimiter=',') csv_writer = csv.writer(csv_out, delimiter=',') # write column headers on first row row = csv_reader.next() row.append('POS TAGS') csv_writer.writerow(row) for row in csv_reader: # use stdout to avoid spaces and newlines sys.stdout.write('.') # need to flush the buffer to display immediately sys.stdout.flush() # clean up html tags plaintext = nltk.clean_html(row[1]) drug = row[3] company = row[5] src = row[0] # only consider texts containing both the drug and company if drug in plaintext and company in plaintext: sentences = sentence_splitter.tokenize(plaintext) # filter for only sentences mentioning drug, company or both # TO DO coreference resolution to find more relevant sentences sentences = [ s for s in sentences if drug in s or company in s ] if len(sentences) > 0: for s in sentences: # remove punctuation, still want to add original sentence to CSV though no_punct = re.findall(r'[\w\$\xc2()-]+', s) no_punct = ' '.join(no_punct) tokens = nltk.word_tokenize(no_punct) tags = nltk.pos_tag(tokens) # TO DO parse tree info, something to do with stemming? # write row to file for each sentence row.append(tags) csv_writer.writerow( [src, s, row[2], drug, row[4], company, tags])
def parse_page_text(url): response = requests.get(url, headers={'User-agent': USER_AGENT}) html = response.text readable_html = readability.readability.Document(html) try: article_only = readable_html.summary() except: return [] raw = nltk.clean_html(article_only) #soup = bs4.BeautifulSoup(html) #raw = nltk.clean_html(str(soup)) sents = nltk.sent_tokenize(raw) sents = [nltk.wordpunct_tokenize(sent) for sent in sents] #sents = [nltk.tokenize.WhitespaceTokenizer().tokenize(sent) for sent in sents] tagged_sents = [nltk.pos_tag(sent) for sent in sents] # get interesting collocations #words = nltk.wordpunct_tokenize(raw) words = nltk.tokenize.WhitespaceTokenizer().tokenize(raw) words = [word.lower() for word in words] punctuation = re.compile(r'[-.?!,":;()]') good_words = [punctuation.sub("", word) for word in words] bigram_finder = nltk.collocations.BigramCollocationFinder.from_words( good_words) trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words( good_words) bigram_finder.apply_freq_filter(2) trigram_finder.apply_freq_filter(1) bigram_measures = nltk.collocations.BigramAssocMeasures() trigram_measures = nltk.collocations.TrigramAssocMeasures() collocations = [] collocations.extend(bigram_finder.nbest(bigram_measures.pmi, 10)) collocations.extend(trigram_finder.nbest(trigram_measures.pmi, 10)) print "\nCOLLOCATIONS :", collocations # get named entities ne_chunks = [nltk.ne_chunk(sent, binary=True) for sent in tagged_sents] nes = [sub_leaves(ne_chunk, 'NE') for ne_chunk in ne_chunks] entities = [] for ne in nes: if len(ne) == 0: continue ne_string = '' for pairs in ne: for pair in pairs: ne_string = ' '.join((ne_string, pair[0])) entities.append(ne_string[1:]) print "\nNES :", entities # get noun phrases nps = [] grammar = r""" NP: {<PP\$>? <JJ>* <NN.*>+} # NP P: {<IN>} # Preposition V: {<V.*>} # Verb PP: {<P> <NP>} # PP -> P NP VP: {<V> <NP|PP>*} # VP -> V (NP|PP)* """ cp = nltk.RegexpParser(grammar) for sent in tagged_sents: tree = cp.parse(sent) for subtree in tree.subtrees(): if subtree.node == 'NP': try: subtree = str(subtree).split()[1:] except UnicodeEncodeError: continue # HACK HACK HACK subtree = ' '.join([item.split('/')[0] for item in subtree]) nps.append(subtree) print "\nNPS :", nps return nps
def localFile(fileInput): if os.path.isfile(fileInput): print "Scraping Local File - {0}".format(fileInput) mimetypes.init() file_type, file_encoding = mimetypes.guess_type(fileInput) print file_type if file_type == 'application/pdf': getPDFContent(fileInput) elif file_type == 'text/html': raw = nltk.clean_html(open(fileInput).read()) tokens = nltk.word_tokenize(raw) if args.minLength or args.maxLength: for token in tokens: if not (len(token.translate(None, charBlacklist)) < minl or len(token) > maxl): wordList.append( str(token).translate(None, charBlacklist)) else: for token in tokens: wordList.append(str(token).translate(None, charBlacklist)) elif file_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': document = docx.opendocx(fileInput) sentances = docx.getdocumenttext(document) sentances = map(lambda s: s.encode("ascii", "ignore"), sentances) if args.minLength or args.maxLength: for sentance in sentances: for word in set(sentance.split()): if not (len(str(word).translate(None, charBlacklist)) < minl or len(str(word)) > maxl): wordList.append( str(word).translate(None, charBlacklist)) else: for sentance in sentances: for word in set(sentance.split()): wordList.append( str(word).translate(None, charBlacklist)) elif file_type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation' or file_type == 'application/x-mspowerpoint.12': try: prs = pptx.Presentation(fileInput) text_runs = list() for slide in prs.slides: for shape in slide.shapes: if not shape.has_textframe: continue for paragraph in shape.textframe.paragraphs: for run in paragraph.runs: text_runs.append(run.text) if args.minLength or args.maxLength: for sentance in text_runs: for word in set(sentance.split()): if not (len( str((word.translate(None, charBlacklist)))) < minl or len(str(word)) > maxl): wordList.append( str(word).translate(None, charBlacklist)) else: for sentance in text_runs: for word in set(sentance.split()): wordList.append( str(word).translate(None, charBlacklist)) except Exception as e: print 'Error opening file: {0}'.format(fileInput) pass else: #'text/plain' or unknown format try: words = set(open(fileInput).read().split()) if args.minLength or args.maxLength: for word in words: if not (len(str( (word.translate(None, charBlacklist)))) < minl or len(str(word)) > maxl): wordList.append( str(word).translate(None, charBlacklist)) else: for word in words: wordList.append( str(word).translate(None, charBlacklist)) except: print 'Error opening file: {0}'.format(fileInput) pass else: print 'Error opening file: {0}'.format(fileInput)
def classification_format(self, raw, subject=None):
    msg = nltk.clean_html(raw)
    fs = self.extract_features(msg, subject)
    return fs

#     taglist_reduced.append(tagitem)

# In[306]:

taglist_reduced[:10]

# In[300]:

corpora = []
for i in filenames:
    doc = open('/Users/brandomr/Sites/docs/' + i)
    text = doc.read()             # grabs the document as variable text
    text = nltk.clean_html(text)  # strips html formatting
    text = text.replace(' ', '\xA0')
    text = text.decode('utf-8', 'ignore')  # gets rid of non-break space html and converts to unicode
    corpora.append(text)          # adds to corpora

# In[301]:

# tokenizes and chunks for entity extraction
def extract_entities(text):
    entities = []

def gett():
    url = "http://www.50states.com/facts/alabama.htm"
    html = urlopen(url).read()
    raw = nltk.clean_html(html)
    print(raw)

def cleanHtml(html):
    return BeautifulStoneSoup(
        clean_html(html),
        convertEntities=BeautifulStoneSoup.HTML_ENTITIES).contents[0]

# 16/08/12 - code tested
#
# check for nltk
try:
    import nltk
except ImportError:
    print "No nltk module, exiting!"
    exit()

from urllib import urlopen
import re

url_home = "http://var2.astro.cz/ETD/"
html_home = urlopen(url_home).read()
raw = nltk.clean_html(html_home).split('\n')

star_name, planet = [], []

# find latest list of planets from ETD homepage
for i in range(0, len(raw)):
    if "Known transiters" in raw[i]:
        loc = i + 1
for i in range(loc, len(raw)):
    if len(raw[i].split()) == 2:
        star_name.append(raw[i].split()[0])
        planet.append(raw[i].split()[1])
    if len(raw[i].split()) == 3:
        name = str(raw[i].split()[0]) + "%20" + str(raw[i].split()[1])

def html2text(str):
    return clean_html(str)

def claim_ext(htmlString):
    a = htmlString.find("Claims")
    b = htmlString.find("Description")
    elem = nltk.clean_html(htmlString[a:b])
    return elem