def test_extract(foo_file):
    r = extract(foo_file)
    u = [node.tag for node in r]
    assert u == [
        'article',
        'body',
    ]
def test_extract_tabular(foo_file):
    r = list(extract(foo_file))
    u = [node.tag for node in r]
    assert u == [
        'article',
        'body',
    ]
    for node in r[0]:
        assert node.tag == 'div'
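The foo_file fixture these tests depend on is not shown; a minimal conftest.py sketch that could back them (the sample markup is a hypothetical stand-in, not the project's real test data):

# conftest.py -- hypothetical fixture; the real test data is not shown above
import pytest

@pytest.fixture
def foo_file():
    # extract() accepts raw document bytes (cf. r.content in the API example below)
    return b"<html><body><article><div>a</div><div>b</div></article></body></html>"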
from requests import get
from libextract.api import extract


def scrape(url):
    """Fetch url and return the concatenated text of its extracted nodes."""
    r = get(url)
    nodes = list(extract(r.content))
    # text_content() already returns str; wrapping encode() in str() would
    # produce b'...' literals under Python 3
    return ''.join(n.text_content() for n in nodes)
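A quick sanity check of scrape(), reusing the Wikipedia URL from the API example further down:

print(scrape('http://en.wikipedia.org/wiki/Information_extraction')[:200])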
def extract_text(self):
    """Parse fulltext, do keyword extraction using the newspaper lib
    => newspaper.readthedocs.io
    """
    libextract_nodes = list(extract(self.html.encode("utf-8")))
    self.fulltext = libextract_nodes[0].text_content()
    entities = EntityExtractor(self.fulltext)
    entities.get_scored_entities()            # Averaged Perceptron Tagger
    self.keywords = entities.get_keywords()   # Above median?
    self.names = entities.get_names()         # Filter top
import difflib

import requests
from urllib.parse import urljoin, urlparse
from lxml import html
from libextract.api import extract


def noise_extractor(url, base_urls):
    """Collect boilerplate text shared between a page and a few of its neighbours."""
    content = requests.get(url).content
    body = html.fromstring(content)
    links = body.xpath("//a/@href")
    # Absolute links that stay on one of the known hosts
    full_links = [link for link in links
                  if urlparse(link).netloc in base_urls
                  and urlparse(link).scheme in ('http', 'https')]
    # Relative links, resolved against the current page's origin
    cur_parse = urlparse(url)
    cur_base = cur_parse.scheme + '://' + cur_parse.netloc
    internal_links = [urljoin(cur_base, link) for link in links
                      if urlparse(link).netloc == ''
                      and not link.startswith('#')
                      and urlparse(link).scheme == '']
    links_to_explore = full_links + internal_links
    # Sample up to four neighbouring pages, plus the page itself
    sample_contents = [requests.get(u).content for u in links_to_explore[:4]] + [content]
    textnodes = [t for c in sample_contents for t in extract(c, count=5)]
    # Text that recurs near-verbatim across pages is treated as noise
    noise = set()
    for i in range(len(textnodes)):
        t1 = textnodes[i].text_content()
        for j in range(i):
            t2 = textnodes[j].text_content()
            if difflib.SequenceMatcher(None, t1, t2).ratio() > 0.9:
                noise.add(t1)
                noise.add(t2)
    return noise
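A hypothetical invocation, assuming the site's own host is the only base domain of interest:

noise = noise_extractor('http://example.com/some-article', base_urls={'example.com'})
for chunk in sorted(noise):
    print(repr(chunk[:60]))  # repeated navigation text, headers, footers, ...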
from requests import get
from libextract.api import extract

r = get('http://en.wikipedia.org/wiki/Information_extraction')
textnodes = list(extract(r.content))

# (The Python 2 reload(sys)/sys.setdefaultencoding('utf-8') hack is
# unnecessary and unavailable under Python 3.)
import urllib.request

from readability import Document
from IO import IO  # project-local helper, unused in this excerpt


class libextract:
    def predict(self, url):
        """Return a dict with the readability-extracted body and title of url."""
        print('Predicting %s' % url)
        d = {'Body': '', 'Title': ''}
        try:
            html = urllib.request.urlopen(url).read()
            d['Body'] = Document(html).summary()
            d['Title'] = Document(html).short_title()
        except Exception:
            print('Exception %s' % url)
            return d
        if d['Title'] is None:
            d['Title'] = ''
        return d
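A hypothetical driver for the wrapper above, reusing the snippet's own example URL:

nw = libextract()
result = nw.predict('http://en.wikipedia.org/wiki/Information_extraction')
print(result['Title'])
print(result['Body'][:200])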
def extract_visible_text(html_path):
    """Print and return the visible text of a document as a single string."""
    textnodes = list(extract(html_path))
    # text_content() returns str; no manual encode()/unicode() round-trip needed
    text_str = ''.join(e.text_content() for e in textnodes)
    print(text_str)
    return text_str
# Fragment from a benchmarking loop: eachurl, data, title_true, body_score,
# title_score, NW and html are defined by the surrounding code (not shown).
print('Boilerpipe...')
try:
    article = Extractor(url=eachurl)
    title = '_'
    # title = article.getTitle()
    content = article.getHTML()
except Exception:
    print('Failed URL %s' % eachurl)
    content = '_'
    title = '_'
body_score[-1].append(fscore(word_tokenize(content), data))
title_score[-1].append(fscore(word_tokenize(title), title_true))
######################################################################################
print('libextract...')
# html = urllib.urlopen(eachurl).read()
textnodes = list(extract(html))
title = '_'  # libextract yields no title; score it as the placeholder
try:
    content = ' '.join(each.text_content() for each in textnodes[:5])
except Exception:
    print('Not combining unicode %s' % eachurl)
    content = '_'
body_score[-1].append(fscore(word_tokenize(content), data))
title_score[-1].append(fscore(word_tokenize(title), title_true))
#####################################################################################
print('NewsExtractor ....')
NW.predict(eachurl)
title = NW.title
content = NW.content
if fscore(word_tokenize(title), title_true) < 0.7:
    print('OOOPS.......')
def extract_all_text(html):
    """Return the text of all extracted nodes as one string."""
    textnodes = list(extract(html))
    return ''.join(e.text_content() for e in textnodes)
def print_nodes(html):
    """Print the concatenated text of every node returned by extract()."""
    textnodes = list(extract(html))
    print(''.join(e.text_content() for e in textnodes))
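Finally, a minimal end-to-end run; the count keyword is the one already used in the noise_extractor snippet above to cap the number of candidate nodes:

from requests import get
from libextract.api import extract

r = get('http://en.wikipedia.org/wiki/Information_extraction')
for node in extract(r.content, count=5):  # keep only the top five candidates
    print(node.tag, len(node.text_content()))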