def getPageText(title, langcode, stoplist, logf):
    # API_HTML, last_api_request, api_wait and the exception classes are
    # module-level helpers defined elsewhere in this project.
    global last_api_request
    api_wait(last_api_request)
    last_api_request = datetime.datetime.now()
    resp = urllib.urlopen(API_HTML % (langcode, title))
    data = json.load(resp)
    if 'error' in data:
        print >>logf, '\tmissing page'
        raise MissingPage()
    p = data['parse']
    html = p['text']['*']
    if not html.strip():
        print >>logf, '\tempty HTML parse returned by API'
        raise EmptyHTML()
    soup = BeautifulSoup(html, 'lxml')
    # Strip navigation and reference elements before boilerplate removal.
    [x.extract() for x in soup.findAll('div', 'toc')]  # table of contents
    [x.extract() for x in soup.findAll('ol', 'references')]
    [x.extract() for x in soup.findAll('div', 'navbox')]
    [x.extract() for x in soup.findAll('pre')]
    """Disabled cleanup rules for maintenance and notice boxes:
    [x.extract() for x in soup.findAll('table', 'notice noprint notice-todo')]
    [x.extract() for x in soup.findAll('table', 'plainlinks cmbox cmbox-content')]
    [x.extract() for x in soup.findAll('table', 'plainlinks tmbox tmbox-notice')]
    [x.extract() for x in soup.findAll('table', 'metadata plainlinks ambox ambox-speedy')]
    [x.extract() for x in soup.findAll('table', 'plainlinks cmbox cmbox-speedy')]
    [x.extract() for x in soup.findAll('table', 'metadata plainlinks ambox ambox-delete')]
    [x.extract() for x in soup.findAll('table', 'plainlinks noprint xambox xambox-type-notice')]
    [x.extract() for x in soup.findAll('table', 'plainlinks xambox xambox-type-notice')]
    [x.extract() for x in soup.findAll('table', 'plainlinks xambox xambox-type-content')]
    [x.extract() for x in soup.findAll('div', 'label_message')]
    [x.extract() for x in soup.findAll('div', 'boilerplate metadata')]
    [x.extract() for x in soup.findAll('div', 'noprint request box')]
    [x.extract() for x in soup.findAll('td', 'mbox-text')]"""
    if stoplist == 'None':
        # With stopwords_high = 0 and stopwords_low = 0 the language is ignored.
        paragraphs = justext.justext(soup.encode('utf-8'),
                                     justext.get_stoplist('English'),
                                     stopwords_high=0, stopwords_low=0,
                                     no_headings=True, max_link_density=0.5)
    else:
        paragraphs = justext.justext(soup.encode('utf-8'),
                                     justext.get_stoplist(stoplist),
                                     no_headings=True, max_link_density=0.5,
                                     stopwords_low=0.35, stopwords_high=0.37)
    text = ''
    parSum = 0
    charSum = 0
    wordSum = 0
    for paragraph in paragraphs:
        if paragraph['cfclass'] == 'good' or paragraph['cfclass'] == 'neargood':
            line = paragraph.get('text')
            text += '<p>\n'
            text += line
            text += '\n</p>\n'
            parSum += 1
            wordSum += paragraph['word_count']
            charSum += len(line)
    if parSum == 0:
        print >>logf, '\tempty prevert returned by jusText'
        raise EmptyJusText()
    print >>logf, '\t%d words' % wordSum
    print >>logf, '\t%d paragraphs' % parSum
    categories = ';'.join([d['*'].replace('"', '') for d in p['categories']])
    header = ('<doc title="%s" categories="%s" translations="%d" '
              'paragraphs="%d" words="%d" chars="%d">\n') % \
        (title.decode('utf-8'), categories, len(p['langlinks']),
         parSum, wordSum, charSum)
    return header + text + '</doc>\n'
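A minimal sketch of the module-level scaffolding getPageText relies on, assuming a MediaWiki parse-API endpoint and a simple one-second rate limiter; the exact URL template, prop list, and delay below are assumptions, not the project's actual definitions.

# Assumed scaffolding for getPageText -- a sketch, not the project's code.
import datetime
import json
import time
import urllib

import justext
from bs4 import BeautifulSoup

# Assumed parse endpoint; the props match the fields the function reads
# (text, categories, langlinks).
API_HTML = ('https://%s.wikipedia.org/w/api.php?action=parse&page=%s'
            '&prop=text|categories|langlinks&format=json')
last_api_request = datetime.datetime.min

def api_wait(last_request, min_delay=1.0):
    # Sleep until at least min_delay seconds have passed since the last call.
    elapsed = (datetime.datetime.now() - last_request).total_seconds()
    if elapsed < min_delay:
        time.sleep(min_delay - elapsed)

class MissingPage(Exception): pass
class EmptyHTML(Exception): pass
class EmptyJusText(Exception): pass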
def test_no_titles(self):
    s = 'text and some <em>other</em> words <span class="class">that I</span> have in my head now '
    html_string = (
        '<html><body>'
        '<p>{0}</p>'
        '<p>text and some <em>other</em> words <span class="class">that I</span> have in my head now</p>'
        '<h2>Smaller header</h2>'
        '<p>footer</p>'
        '</body></html>'
    ).format(s * 5)
    pars = justext(html_string, get_stoplist("English"))
    title = justitle(html_string, pars)
    assert title is None
def extract_named_entities(article):
    paragraphs = justext(article.source_text.encode("utf-8"),
                         get_stoplist("English"))
    pprint(paragraphs)
    text = u" ".join(p["text"] for p in paragraphs
                     if p["class"] == "good" and not p.get("heading"))
    sentences = nltk.sent_tokenize(text)
    print sentences
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    tagged_sentences = [nltk.pos_tag(sent) for sent in tokenized_sentences]
    named_entity_chunks = [nltk.ne_chunk(sent, binary=True)
                           for sent in tagged_sentences]
    for chunk in named_entity_chunks:
        for pos in chunk:
            # Plain (word, tag) tuples are ordinary tokens; named entities
            # come back as subtrees, whose leaves we join into one string.
            if isinstance(pos, tuple):
                continue
            yield u" ".join(w for (w, p) in pos)
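A hedged usage example for extract_named_entities; the Article stand-in and the sample HTML are made up for illustration, and NLTK's tokenizer, tagger, and chunker models are assumed to be downloaded.

# Hypothetical driver -- Article and the sample text are illustrative only.
import collections

Article = collections.namedtuple('Article', ['source_text'])

article = Article(source_text=u'<html><body><p>Guido van Rossum created '
                              u'Python at CWI in Amsterdam.</p></body></html>')
for entity in extract_named_entities(article):
    print entity  # expected to yield names such as u'Guido van Rossum'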
def test_accepts_html_tree(self):
    try:
        justext(html_to_dom(html_string), get_stoplist("English"))
    except JustextMalformedInput:
        self.fail("justext wasn't supposed to raise JustextMalformedInput "
                  "exception on a parsed HTML tree")
def test_no_heading(self):
    html_string = '<html><body><title>Header | site.com</title></body></html>'
    pars = justext(html_string, get_stoplist("English"))
    title = justitle(html_string, pars)
    assert title == "Header | site.com"