def find_full_text(html_source):
    doc = Doc(html_source)
    content = doc.summary()
    stripped = strip_tags(content)
    stripped = to_plain_text(stripped)
    return stripped
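For context, a self-contained sketch of the same pattern. `Doc`, `strip_tags`, and `to_plain_text` above are project-local helpers that are not shown here, so this version assumes they wrap python-readability's `Document` and an HTML-to-text step, and swaps in `lxml`'s `text_content()` for the latter.

import lxml.html
from readability import Document

def find_full_text_sketch(html_source):
    # extract the main-article HTML, then flatten it to plain text
    content = Document(html_source).summary()
    return lxml.html.fromstring(content).text_content().strip()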
def markdownify(url_list, **options):
    articles = []
    images = []
    paragraph_links = options['paragraph_links']
    wrap_text = options['wrap_text']
    preamble = options['preamble']

    for url in url_list:
        req = urllib2.Request(url, None, {'Referer': url_list[0]})
        html = urllib2.urlopen(req).read()
        document = Document(html, url=url)
        readable_title = document.short_title()
        summary = document.summary()
        summary_doc = build_doc(summary)
        images.extend([a.get('src') for a in summary_doc.findall('.//img')])
        articles.append(document.summary())

    markdown_articles = []
    for (article, url) in zip(articles, url_list):
        h = html2text.HTML2Text(baseurl=url)
        h.inline_links = False
        h.links_each_paragraph = (paragraph_links and 1) or 0
        h.body_width = (wrap_text and 78) or 0
        markdown_articles.append(h.handle(article))

    combined_article = u"\n\n----\n\n".join(markdown_articles)
    if preamble:
        combined_article = (u"Title: %s \nOriginal URL: %s\n\n" % (readable_title, url_list[0])) + combined_article
    return combined_article.encode("utf-8")
def run(index):
    print "Index %d" % index
    dirname = "data/%04d" % index

    # url of english article
    url = open(dirname + "/url_en.txt").read()

    # download html
    html = urllib.urlopen(url).read().decode('latin-1')

    # apply readability
    document = Document(html)
    article = document.summary()
    article = nltk.clean_html(article)

    # replace latin characters
    article = re.sub(u' ', u'\n', article)
    article = re.sub(u'\x92', u'`', article)
    article = re.sub(u'\x96', u'-', article)

    # article_en.txt
    output = codecs.open(dirname + "/article_en.txt", 'w', encoding='ascii', errors='ignore')
    output.write(article)
    output.close()

    # title.txt
    output = codecs.open(dirname + "/title.txt", 'w', encoding='ascii', errors='ignore')
    output.write(document.title())
    output.close()
def extract_article(url):
    r = requests.get(url)

    # if the url exists, continue
    if r.status_code == 200:

        # extract and parse response url
        url = parse_url(r.url)

        # extract html
        html = r.content.decode('utf-8', errors='ignore')

        # run boilerpipe
        # boilerpipe_extractor = Extractor(html=html)

        # run readability
        readability_extractor = Document(html)
        html = readability_extractor.summary()

        # return article data
        return {
            'title': readability_extractor.short_title(),
            'html': html,
            'content': strip_tags(html).encode('utf-8', errors='ignore'),
            'url': url
        }

    # otherwise return an empty dict
    else:
        return {}
def get_screen_play(self, url):
    """Download webpage and analyze basic sequence

    :param url:
    :return:
    """
    res = requests.get(url)
    html = res.content.decode('utf-8')

    # Analyze basic sequence
    readable_article = Document(html).summary()
    self.readable_article = readable_article
    readable_title = Document(html).title()
    self.readable_title = readable_title

    base_url = path.dirname(res.request.url)
    result = Extractor(base_url).html_to_asset_list(readable_article)
    #print(result)

    df_screenplay = pd.DataFrame(result, columns=['type', 'content'])
    df_screenplay['local_src'] = df_screenplay['content'].apply(lambda x: self.string2hash(x))

    image_selector = (df_screenplay['type'] == 'image')
    df_screenplay.loc[image_selector, 'filename'] = df_screenplay.loc[
        image_selector, 'content'].apply(lambda x: path.basename(x))
    df_screenplay.loc[image_selector, 'extname'] = df_screenplay.loc[
        image_selector, 'filename'].apply(lambda x: path.splitext(x)[1])
    df_screenplay = df_screenplay.fillna('')
    df_screenplay['download_name'] = df_screenplay['local_src'] + df_screenplay['extname']
    df_screenplay['converted_name'] = df_screenplay['local_src'] + '.png'

    self.df_screenplay = df_screenplay
    return df_screenplay
def _update(self, response):
    data = Document(response.text).summary()
    doc = lxml.html.fromstring(data)
    images = []
    for img in doc.xpath("//img"):
        src = urlparse.urljoin(response.url, img.get("src"))
        imgResp = requests.get(src)
        encoded = base64.b64encode(imgResp.content)
        if len(encoded) < 3000:
            src = "data:" + imgResp.headers["content-type"] + ";base64," + encoded
        else:
            md5 = hashlib.sha1()
            md5.update(encoded)
            name = md5.hexdigest()
            src = name + "." + src.rpartition(".")[2]
            images.append((src, encoded))
        img.set("src", src)

    data = StringIO()
    data.write(lxml.etree.tostring(doc, pretty_print=True))
    for (name, imageData) in images:
        data.write("\n--data:" + name + "\n" + imageData)
    data.seek(0)
    self.article = data.read()
    self.last_updated = datetime.now()
class Article:

    def __init__(self, url):
        print('Saving page: {}'.format(url))
        res = requests.get(url)
        self.url = url
        self.article = Document(res.content)
        self._add_title()
        self._save_images()

    def _add_title(self):
        self.root = etree.fromstring(self.article.summary())
        body = self.root.find('body')

        title = self.article.title()
        ascii_title = unidecode(title) if type(title) == unicode else title

        title_header = etree.HTML('<h2>{}</h2>'.format(ascii_title))
        body.insert(0, title_header)

    def _save_images(self):
        tmppath = tempfile.mkdtemp()
        images = self.root.xpath('//img')
        for img in images:
            imgsrc = img.get('src')

            # handle scheme-agnostic URLs
            if 'http' not in imgsrc and '//' in imgsrc:
                imgsrc = 'http:{}'.format(imgsrc)
            # handle relative file paths
            elif 'http' not in imgsrc:
                parsed = urlparse(self.url)
                imgsrc = '{}://{}{}'.format(parsed.scheme, parsed.netloc, imgsrc)

            filename = os.path.basename(imgsrc)
            dest = os.path.join(tmppath, filename)
            try:
                res = requests.get(imgsrc)
            except Exception as e:
                print('Could not fetch image ({}) from "{}"'.format(str(e), imgsrc))
                return

            if res.status_code == 404:
                print('Could not fetch image (HTTP 404), attempted fetch: "{}", source URL: {}'.format(imgsrc, img.get('src')))
                continue

            with open(dest, 'wb') as f:
                f.write(res.content)

            img.set('src', dest)

    @property
    def title(self):
        return self.article.title()

    @property
    def html(self):
        return etree.tostring(self.root)
def extract(text):
    soup = BeautifulSoup(text, 'html.parser')  # , from_encoding="utf8"
    aaa = soup.find('li', {'id': 'EntryTag'})
    print aaa
    bbb = soup.find('div', {'id': 'BlogPostCategory'})
    tag_str = ''
    print bbb

    soup1 = soup.find('div', {'id': 'cnblogs_post_body'})
    if soup1:
        try:
            content = str(soup1)
            logging.info('find content in html tag')
        except:
            content = Document(text).summary()
            logging.info('converting soup to string failed, falling back to readability', exc_info=True)
    else:
        content = Document(text).summary()
        logging.info('find content via readability')

    try:
        aaaa = aaa.find_all('a')
        tag_list = [i2.get_text() for i2 in aaaa]
        tag_str = ','.join(tag_list)
        aaab = bbb.find_all('a')
        tag_list2 = [i2.get_text() for i2 in aaab]
        tag_str += ','.join(tag_list2)
    except Exception, e:
        # print Exception, e
        logging.error('cannot find keyword in html', exc_info=True)
def extract_article(url):
    r = requests.get(url)

    # if the url exists, continue
    if r.status_code == 200:

        # extract and parse response url
        url = parse_url(r.url)

        # extract html
        html = r.content.decode('utf-8', errors='ignore')

        # run boilerpipe
        BP = Extractor(html=html)

        # run readability
        Rdb = Document(html)
        html = Rdb.summary()

        # return article data
        return {
            'extracted_title': Rdb.short_title().strip(),
            'extracted_content': strip_tags(BP.getText()),
        }

    # otherwise return an empty dict
    else:
        return {}
def getText():
    dataList = []
    for f in os.listdir('unsupervised\\documents'):
        filePath = 'unsupervised\\documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print ''  #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)
                #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print ''  #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print ''  #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
            dataList.append(final)
            #print '*** TITLE *** \n\"' + title + '\"\n'
            #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print ''  # 'undetected document type'
        print ''  #"-------------------------------"
    return dataList
def contents_scraping(link, remove_space=True, remove_lb=True):
    """Scraping contents.

    Parameter
    ---------
    url : str
        Scraping target url.

    Return
    ------
    list : title and contents.
    """
    try:
        html = urllib.request.urlopen(link).read()
    except:
        print("ERROR : failed to get contents. -> " + link)
        return (False, "")

    title = Document(html).short_title()
    contents = Document(html).summary()
    contents = html2text.html2text(contents)

    p = re.compile(r"<[^>]*?>")
    c = p.sub("", contents)

    if remove_space is True:
        c = c.replace(" ", "")
    if remove_lb is True:
        c = c.replace("\r", "")
        c = c.replace("\n", "")

    return title, c
def recommend_by_url(url):
    parsed = urlparse(url)
    doc = Document(requests.get(url).content)
    content = html.fromstring(doc.content()).xpath('string()')
    bigrams = make_bigrams(content)
    vec_bow = dictionary.doc2bow(bigrams)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    #print sims
    docs = sorted(list(enumerate(sims)), key=lambda item: -item[1])
    results, seen = [], []
    for doc, score in docs:
        res = ARTICLES[doc]
        if not 'url' in res or res['url'] in seen:
            continue
        seen.append(res['url'])
        p = urlparse(res['url'])
        if p.hostname.endswith(parsed.hostname):
            continue
        res['score'] = float(score)
        if 'content' in res:
            del res['content']
        if 'html' in res:
            del res['html']
        if res['summary']:
            res['summary'] = res['summary'].strip()
        results.append(res)
        if len(results) > 14:
            break
    return results
class Gist:
    keyword_pattern = re.compile(r'^[^\d]+$')
    stop_words = set(get_stop_words('en'))

    def __init__(self, html):
        self.html = html
        self.document = Document(html)

    @property
    def title(self):
        return self.document.short_title()

    @cached_property
    def text(self):
        text = self.document.summary()
        text = re.sub('<br[^>]+>', '\n', text)
        text = re.sub('</?p[^>]+>', '\n\n', text)
        text = re.sub('<[^>]+>', '', text)
        text = re.sub('^[ \t]+$', '', text)
        text = re.sub('\n{3,}', '\n\n', text, flags=re.MULTILINE)
        return text

    @staticmethod
    def _common_prefix(one, two):
        parallelity = [x == y for x, y in zip(one, two)] + [False]
        return parallelity.index(False)

    @classmethod
    def _find_representative(cls, stem, text):
        tokens = text.split()
        prefixes = {token: cls._common_prefix(token, stem) for token in tokens}
        best = lambda token: (-token[1], len(token[0]))
        return sorted(prefixes.items(), key=best)[0][0]

    @classmethod
    def _is_good_keyword(cls, word):
        return (word not in cls.stop_words) and cls.keyword_pattern.match(word)

    @classmethod
    def find_keywords(cls, text):
        whoosh_backend = SearchForm().searchqueryset.query.backend
        if not whoosh_backend.setup_complete:
            whoosh_backend.setup()
        with whoosh_backend.index.searcher() as searcher:
            keywords = searcher.key_terms_from_text(
                'text', text, numterms=10, normalize=False)
            keywords = list(zip(*keywords))[0] if keywords else []
        keywords = [cls._find_representative(keyword, text) for keyword in keywords]
        keywords = [keyword for keyword in keywords if cls._is_good_keyword(keyword)]
        # no double keywords in list
        keywords = list(set(keywords))
        # no punctuation in suggested keywords
        keywords = [''.join(c for c in s if c not in string.punctuation) for s in keywords]
        return keywords

    @property
    def keywords(self):
        return self.find_keywords(self.text)
def get_announcement_body(url):
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()

    readable_announcement = Document(html).summary()
    readable_title = Document(html).title()
    soup = BeautifulSoup(readable_announcement, "lxml")
    final_announcement = soup.text
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        images.append(li)

    resp[0] = str(final_announcement.encode("ascii", "ignore"))
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = str(now.month) + " " + str(now.day) + " " + str(now.year) + "-" + str(now.hour) + ":" + str(now.minute) + ":" + str(now.second)
    resp[3] = url
    resp[4] = url
    resp[5] = ""
    #insertDB(resp)
    #print "inserted resp"

    title_article = []
    title_article.append(final_announcement)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
def process(doc, params):
    url = params['url']
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir='/tmp/')
            img_src = urljoin(url, img.get('src'))
            img_name = None
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                img_name = get_filename_from_url(img_src)
                write_file(r, fp)
            else:
                img_meta, content = img_src.split(',')
                image = base64.b64decode(content)
                img_name = get_filename_from_base64(img_meta)
                fp.write(image)
            images.append((img_name, fp))
        except Exception:
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )
    html = '<h1>' + title + '</h1>' + summary
    html = '<p>{}</p>'.format(html)
    text = html2text.html2text(html)
    return text, images, 1, None
def get_data(url):
    error_num = 0
    while True:
        if error_num >= 10:
            cprint("Finished Because error_num reached 10 times", "red")
            return 0, 0
        try:
            req = requests.get(url)
            if int(req.status_code) == 503:
                cprint("Google detected the abnormal network traffic", "red")
                time.sleep(60 * 60)
            elif int(req.status_code) != 200:
                cprint("Now Get StatusCode{}: Error_num{}".format(req.status_code, error_num), "red")
                return 0, 0
            else:
                html = req.text
                break
        except ConnectionError:
            cprint("Now Get ConnectionError: Error_num{}".format(error_num), "red")
            error_num += 1
            time.sleep(5)

    try:
        document = Document(html)
        content_html = document.summary()
        content_text = lxml.html.fromstring(content_html).text_content().strip()
        short_title = document.short_title()
        return short_title, content_text
    except:
        return 0, 0
def get_webpage_by_html(url, html=None):
    html = get_html_str(url, html)
    summary_obj = predefined_site(url, html)
    article = video_site(url)
    if summary_obj is None:
        doc = Document(html, url=url, debug=True, multipage=False)
        summary_obj = doc.summary_with_metadata(enclose_with_html_tag=False)
    title = summary_obj.short_title
    if article is None:
        article = summary_obj.html

    from urllib.parse import urlparse
    webpage = Webpage()
    webpage.url = url
    webpage.domain = urlparse(url).hostname
    webpage.title = title
    webpage.favicon = ""
    webpage.top_image = None
    webpage.excerpt = summary_obj.description
    webpage.author = None
    webpage.content = article
    webpage.tags = get_suggest_tags(title, article, summary_obj.keywords)
    webpage.movies = []
    webpage.raw_html = html
    webpage.publish_date = None
    webpage.segmentation = get_segmentation(title, article)
    return webpage.__dict__
def crawl(site, depth, linksfile):
    pattern = re.compile(r'href="(http://.*?)"')
    f = open(linksfile, 'a+')
    try:
        if depth < MAX_DEPTH:
            print 'crawling [%s]...' % site,
            print >> f, '[%s]' % site
            br = mechanize.Browser()
            br.set_handle_robots(False)
            br.addheaders = [('User-agent', 'Firefox')]
            url = br.open(site)
            content = url.read()
            hits = pattern.findall(content)
            for hit in hits:
                print >> f, hit
                url2 = br.open(hit)
                content2 = url2.read()
                readable_article = Document(content2).summary()
                readable_title = Document(content).short_title()
                soup = BeautifulSoup(readable_article)
                final_article = soup.text
                links = soup.findAll('img', src=True)
                print final_article
            print 'done.'
            print >> f, ''
            for hit in hits:
                crawl(hit, depth + 1, linksfile)
    except:
        pass
    f.close()
def loadFromWeb(cls, url):
    html = requests.get(url).content
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()
    cleantext = BeautifulSoup(readable_article).text
    cleantext = HTMLParser.HTMLParser().unescape(cleantext)
    return cleantext
async def enrich(self, result):
    if not self.soup:
        return result

    result.set('title', self.soup.title.string, 0, 'textlength')

    if result.has('content'):
        return result

    parts = []
    for txt in self.soup.find_all("noscript"):
        if txt.string is not None:
            parts.append(txt.string)
    html = " ".join(parts).strip()
    if not html:
        html = self.soup.all_text()

    try:
        doc = Document(html, url=self.url)
        content = doc.summary(html_partial=True)
        result.set('content', sanitize_html(content))
    # pylint: disable=bare-except
    except:
        pass
    return result
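Several snippets in this collection pass html_partial=True to Document.summary(). A minimal sketch (assuming the readability-lxml package is installed; the sample HTML is made up) showing the difference: the default output is wrapped in html/body tags, while the partial output is just the extracted fragment, which is easier to embed in another page.

from readability import Document

sample = "<html><body><article><p>" + "Readable paragraph text. " * 80 + "</p></article></body></html>"
doc = Document(sample)

full = doc.summary()                       # wrapped in <html><body>...</body></html>
fragment = doc.summary(html_partial=True)  # bare extracted fragment

print(full[:60])
print(fragment[:60])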
def process(doc, url):
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR)
            img_src = urljoin(url, img.get('src'))
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                write_file(r, fp)
            else:
                image = base64.b64decode(img_src.split(',')[1])
                fp.write(image)
            images.append(fp)
        except Exception:
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )
    html = '<h1>' + title + '</h1>' + summary
    regex = re.compile('\n*', flags=re.IGNORECASE)
    html = '<p>{}</p>'.format(regex.sub('', html))
    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images, 1
def main():
    novels = {
        'cbi': 'https://boxnovel.com/novel/castle-of-black-iron/chapter-',
        'sgg': 'https://boxnovel.com/novel/super-gene/chapter-',
        'sas': 'https://boxnovel.com/novel/strongest-abandoned-son/chapter-',
        'atg': 'https://www.wuxiaworld.com/novel/against-the-gods/atg-chapter-'
    }
    total = []

    if len(sys.argv) < 4:
        inicio = int(sys.argv[2])
        fim = int(sys.argv[2]) + 1
    else:
        inicio = int(sys.argv[2])
        fim = int(sys.argv[3]) + 1

    url = novels[sys.argv[1]]

    for i in range(inicio, fim):
        response = getPage(url + str(i))
        doc = Document(response.text)
        fileName = re.sub(r'[^a-zA-Z0-9]+', ' ', doc.title())
        total.append(doc.summary())
        print(i)

    f = open(fileName + str(fim - 1) + '.html', 'w')
    for i in total:
        f.write(i)
    f.close()
def _parse_article(self, response):
    feed_entry = response.meta["feed_entry"]

    il = FeedEntryItemLoader(parent=response.meta["il"])
    try:
        response.text
    except AttributeError:
        # Response is not text (e.g. PDF, ...).
        il.add_value("title", feed_entry.get("title"))
        il.add_value("content_html", feed_entry.get("summary"))
        return il.load_item()

    doc = Document(response.text, url=response.url)
    il.add_value("title", doc.short_title() or feed_entry.get("title"))
    summary = feed_entry.get("summary")
    try:
        content = doc.summary(html_partial=True)
        if summary and len(summary) > len(content):
            # Something probably went wrong if the extracted content is shorter than
            # the summary.
            raise Unparseable
    except Unparseable:
        content = summary
    il.add_value("content_html", content)

    return il.load_item()
def getTextFromHTML(self, url_id):
    """ Runs Readability (Document) on the HTML text """
    html_row = get_html(self.pg_conn, url_id)
    if not html_row or 'html' not in html_row:
        return False
    if html_row['readabletext'] and html_row['readabletext'] != '':
        return html_row['readabletext']

    html = html_row['html']
    try:
        html_summary = Document(html).summary(html_partial=True)
        html_summary = html_summary.replace('\n', '').replace('\t', '')
        if (len(html_summary) < 150
                or "Something's wrong here..." in html_summary
                or "<h1>Not Found</h1><p>The requested URL" in html_summary
                or html_summary == "<html><head/></html>"
                or "403 Forbidden" in html_summary):
            return False
        raw_text = lxml.html.document_fromstring(html_summary).text_content()
    except:
        raw_text = False

    if raw_text:
        save_readabletext(self.pg_conn, url_id, raw_text, 'meta')
    else:
        save_readabletext(self.pg_conn, url_id, '', 'meta')

    return raw_text
def reada(url, cache=True):
    if cache:
        cached = memcache.get(key=url)
        if cached is not None:
            return cached

    #file = urllib.urlopen(url)
    #import urllib2
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    file = opener.open(url)

    enc = 'utf-8'
    text = ''
    try:
        # 1. web html -> readability
        raw = Document(file.read(), url=url)
        html = raw.summary().encode(enc, 'replace')
        title = raw.short_title()

        # 2. readability -> markdown (copied from main)
        data = html.decode(enc)
        h = html2text.HTML2Text(baseurl=url)
        h.ignore_images = False
        h.body_width = 100000
        text = h.handle(data)
    finally:
        file.close()

    d = {'url': url, 'title': title, 'content': text}
    if cache:
        memcache.add(key=url, value=d, time=600)
    return d
def getTitleAndContent(self, contentUrl):
    myHeader = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:55.0) Gecko/20100101 Firefox/55.0',
    }
    try:
        r = self.http.request('GET', contentUrl, headers=myHeader)
        # print(r.status)  # 200
        # get the html source and decode it as utf-8
        # print(r.data.decode())
        html = r.data
        readable_title = Document(html).short_title()
        readable_article = Document(html).summary()
        content = self.ht.handle(readable_article)
        # content = re.sub(r'阅读剩余全文()|该菜谱创建于[\s\S]+任何部分的内容。|(更多相关资讯请关注:|用手机访问|1[\s\d]+\s下一页|\*\s|精美图片)[\s\S]+|(新闻热线:[\s\S]+)#', '', content)
        response = etree.HTML(html)
        # content = response.xpath("string(//div[@class='text-3zQ3cZD4'])")
        # content = re.sub(
        #     r'图集|(\+1\s|【纠错】)[\s\S]+', '',
        #     content).strip()
        # script = response.xpath("//script")[5].text
        # response = re.findall('contentList":([\s\S]+),"currentPage', script)[0]
        # datas = json.loads(response)[0]
        # strData = datas['data']
        # pat = re.compile('<[^>]+>', re.S)
        # content = pat.sub('', strData)
        # content = ''.join(content).replace(u'\u3000', '').replace(u'\xa0', '').strip()
        data = dict()
        data["title"] = readable_title
        data["content"] = content
        return self.return_data(0, "success", data)
    except Exception as e:
        return self.return_data(1, e)
def getContent(url):
    print '@@ start crawl %s @@@' % url
    html = getHTml(url)

    '''readability-based extraction'''
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()
    a = re.sub(r'<script[\s\S]*?</script>| ', '', readable_article).strip()
    b = re.sub(r'<(?!p|img|/p|br|iframe)[^<>]*?>', '', a).strip()
    c = re.sub(r'<p[^>]*?>', '<p>', b).strip().replace('\n', '')
    d = re.sub(r'<p>\s+<p>', '', c)

    # count the number of Chinese characters
    num = number(b)
    if num > 100:
        #sql = '''INSERT INTO newbaidu_detail_contont VALUES ('%s','%s','%s','%s')''' % (url, readable_title, d, current_date)
        getc = url + '\n' + readable_title + '\n' + d + '\n' + current_date + '\n'
        try:
            with open('news/' + readable_title + '.txt', 'w') as f2:
                f2.write(getc)
            print 'write succeeded'
        except Exception, e:
            print 'write failed: %s' % e
    return 'success'
def main():
    #print 'Hello there'
    # Command line args are in sys.argv[1], sys.argv[2] ...
    # sys.argv[0] is the script name itself and can be ignored
    dataList = []
    for f in os.listdir('documents'):
        filePath = 'documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print ''  #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)
                #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print ''  #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            # with open(filePath) as f:
            #     doc = slate.PDF(f)
            #     print doc[1]
            #     exit()
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print ''  #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
            dataList.append(final)
            #print '*** TITLE *** \n\"' + title + '\"\n'
            #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print ''  # 'undetected document type'
        print ''  #"-------------------------------"

    #print dataList
    #for i in dataList:
    #    print i

    cachedStopWords = stopwords.words("english")
    combined = ' '.join(dataList)
    #print combined
    bloblist = [tb(combined)]
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, blob, bloblist)
                  for word in blob.words
                  if word not in nltk.corpus.stopwords.words('english')}
        #print scores
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        #print sorted_words
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
def download_via_url(url):
    response = requests.get(url)
    doc = Document(response.text)
    title = doc.title()
    summary = doc.summary()
    soup = BeautifulSoup(summary, "html.parser")
    return title, soup.text
def extract(self, html):
    # https://github.com/buriy/python-readability/blob/master/readability/readability.py
    doc = Document(html)
    self.__title = doc.title()
    self.__html = doc.summary()
    self.__md = html2text.html2text(self.__html)
    self.__text = self.__format_to_text(self.__html)
    return self.__text
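A small standalone usage sketch of the same readability-plus-html2text pattern, outside the class (assumed: requests, readability-lxml and html2text are installed; the URL is a placeholder):

import requests
import html2text
from readability import Document

resp = requests.get("https://example.com/article")  # placeholder URL
doc = Document(resp.text)
print(doc.title())
print(html2text.html2text(doc.summary())[:300])  # markdown version of the extracted article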
def process(doc):
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    text = text_maker.handle(summary)
    return title, text
def main():
    html = urllib.urlopen("http://habrahabr.ru/post/150756/").read()
    doc = Document(html)
    short_title = doc.short_title()
    readable_article = doc.summary()
    f = open("C:\\users\\mykola\\documents\\%s.html" % short_title, "wb")
    f.write(readable_article.encode("utf-8"))
    f.close()
def parse(self, response):
    doc = Document(response.text)
    yield {
        'full_title': doc.title(),
        # 'date': response.selector.xpath('//time/@datetime').getall()
        # 'date': response.xpath('//span[@class="post-date"]/text()').get()
        'date': '2009'
    }
def _getResponseText(self, response):
    '''
    (response) -> Text

    Returns text within the body of an HttpResponse object.
    '''
    readability = Document(response.body)
    content = readability.title() + readability.summary()
    return content
def get_main_content(html):
    readable_title = Document(html).short_title()
    readable_article = Document(html).summary()
    text_p = re.sub(r'</?div.*?>', '', readable_article)
    text_p = re.sub(r'((</p>)?<a href=.*?>|</a>(<p>)?)', '', text_p)
    text_p = re.sub(r'<select>.*?</select>', '', text_p)
    return readable_title, text_p
def checkerFunction(myInput):
    today = datetime.date.today()
    try:
        google1 = 'http://www.google.com/search?hl=en&q='
        google2 = '%20privacy%20policy&btnI=1'
        keyword = myInput
        url = google1 + keyword + google2
        r = requests.get(url, allow_redirects=False)
        url = r.headers['location']
    except Exception as e:
        return

    myFullPath = "./sandbox/db/" + keyword
    if not os.path.exists("./sandbox"):
        os.makedirs("./sandbox")
    if not os.path.exists("./sandbox/db/"):
        os.makedirs("./sandbox/db/")
    if not os.path.exists(myFullPath):
        os.makedirs(myFullPath)

    filename = keyword + "." + str(today)
    filetowrite = myFullPath + "/" + filename
    fileExist = os.path.isfile(filetowrite)

    if url is None:
        return

    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary()

    tempFileMade = False
    originalFileMade = False
    if fileExist:
        filetowrite = filetowrite + ".tmp."
        f = open(filetowrite, 'w')
        writeThis = str(readable_article.encode('ascii', 'ignore'))
        f.write(writeThis)
        f.close()
        tempFileMade = True
    else:
        f = open(filetowrite, 'w')
        writeThis = str(readable_article.encode('ascii', 'ignore'))
        f.write(writeThis)
        f.close()
        originalFileMade = True

    hashedmd5 = hashlib.md5(readable_article.encode('ascii', 'ignore'))
    hashedArticle = hashedmd5.hexdigest()
    return hashedArticle
def get_article_from_item(self, item):
    url = item['link']
    logging.debug(url)
    author = 'n/a'
    if item.has_key('author'):
        author = item.author
    html = urllib.urlopen(url).read()
    doc = Document(html)
    return Article(doc.title(), doc.short_title(), author, doc.summary())
def news():
    search = request.args.get('q')
    if request.args.get('count'):
        count = request.args.get('count')
    else:
        count = 10
    if request.args.get('offset'):
        offset = request.args.get('offset')
    else:
        offset = 0

    if search:
        headers = {'Ocp-Apim-Subscription-Key': 'd94125558b884a309dd71f9e1aa8b9fb'}
        params = urllib.parse.urlencode({
            'q': search,
            'count': count,
            'offset': offset,
            'mkt': 'en-id',
            'safesearch': 'Moderate',
        })
        try:
            conn_url = http.client.HTTPSConnection('api.cognitive.microsoft.com')
            conn_url.request("GET", "/bing/v7.0/news/search?%s" % params, "{body}", headers)
            response = conn_url.getresponse()
            data = response.read().decode('utf-8')
            data_array = json.loads(data)
            conn_url.close()
        except Exception as e:
            print("[Errno {0}] {1}".format(e.errno, e.strerror))

        print(data_array)
        i = 0
        for result in data_array['value']:
            try:
                response = requests.get(result['url'], verify=False, allow_redirects=False)
            except requests.exceptions.ConnectionError:
                print(result['url'], "Connection refused")
                response = requests.get("https://pens.ac.id", verify=False)
            print(result['url'])

            doc = Document(response.content)
            raw = BeautifulSoup(doc.summary(html_partial=True), 'html.parser').get_text()
            result['sentiment'] = int(getSentiment(raw))
            print("SENTIMENT : ", result['sentiment'])
            result['status'] = analyze(raw)
            result['id_rank'] = i
            if result['datePublished']:
                result['datePublished'] = parser.parse(result['datePublished'])
                result['datePublished'] = result['datePublished'].strftime('Diterbitkan pada %d %b %Y pukul %I:%M WIB')
                print(result['datePublished'])
            i += 1

        return render_template("news.html", data=data_array)
    else:
        return render_template("news.html")
def getReadability(url):
    #url = 'http://cnn.com/2016/07/17/health/south-africa-meerkat-telescope-galaxies/index.html'
    try:
        html = urllib.urlopen(url).read()
        readable_article = Document(html).summary().replace('\n', '')
        readable_title = Document(html).short_title()
        return readable_title, readable_article
    except Exception, e:
        return '', ''
def crawl_url(url):
    html = requests.get(url)
    doc = Document(html.content)
    content = doc.summary().encode('utf-8')
    title = doc.title().encode('utf-8')
    return {
        'content': content,
        'title': title
    }
def get_article(url, referrer=None):
    """Fetch the html found at url and use the readability algorithm
    to return just the text content"""
    html = load_url(url, referrer)
    if html is not None:
        doc_html = Document(html).summary(html_partial=True)
        clean_html = doc_html.replace('&', u'&').replace(u' ', u'\n')
        return BeautifulSoup(clean_html).getText(separator=u' ').replace(u' ', u' ')
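The entity and whitespace replacements above appear to have lost their original literals in transit. As an alternative, a minimal sketch that leans on BeautifulSoup, which already decodes HTML entities while extracting text, so only whitespace normalization remains. load_url above is the snippet's own helper; plain requests is assumed here instead.

import re
import requests
from bs4 import BeautifulSoup
from readability import Document

def get_article_text(url):
    raw_html = requests.get(url).text
    doc_html = Document(raw_html).summary(html_partial=True)
    # entities are decoded during parsing; collapse runs of whitespace afterwards
    text = BeautifulSoup(doc_html, "html.parser").get_text(separator=" ")
    return re.sub(r"\s+", " ", text).strip()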
def scrape(url, pdf_filename, pdf_page_size=PDF_PAGE_SIZE, folder=OUTPUT_FOLDER,
           clean_it=True, css_file=EPUB_CSS, lang=EPUB_LANG, cover_image=EPUB_COVER,
           isbn=None):
    """Fetch the html content at url and convert it to a pdf file, cleaned by
    readability and framed in an easy-to-read format if clean_it is True"""
    raw_html = get_url(url)
    if raw_html is None:
        print "Sorry, could not read ", url
    else:
        filename_prefix, file_ext = os.path.splitext(pdf_filename)
        if clean_it:
            # use readability to get rid of crap
            title = Document(raw_html).short_title()
            content = Document(raw_html).summary(html_partial=True)
            # write the cleaned contents to an html frame for pdf conversion
            frame = HTML_FRAME.substitute(content=to_unicode(content), url=url, title=title)
            # unlike pdf, epub is controlled by css, so save the cleaned html alone
            epub_source = write_file(
                folder,
                os.extsep.join([filename_prefix + '_epub', 'html']),
                to_unicode(content))
            pdf_source = write_file(folder, os.extsep.join([filename_prefix, 'html']), frame)
        else:
            title = filename_prefix
            # no readability cleaning requested, so use the fetched html as-is
            epub_source = write_file(
                folder,
                os.extsep.join([filename_prefix + '_epub', 'html']),
                to_unicode(raw_html))
            pdf_source = write_file(folder, os.extsep.join([filename_prefix, 'html']),
                                    to_unicode(raw_html))

        if epub_source:
            generate_epub(
                folder, filename_prefix, title,
                os.path.join(folder, os.extsep.join([filename_prefix + '_epub', 'html'])),
                css_file, cover_image, lang, isbn)
        if pdf_source:
            generate_pdf(folder, filename_prefix, pdf_page_size)
def body_via_readability(page_html, source_url):
    """
    Readability is good at article + title.
    """
    obj = Document(page_html)
    body = obj.summary()
    if not body:
        return None
    return html.prepare(body, source_url)
def fetch_url(url):
    '''
    get url with readability
    '''
    html = basic_fetch_url(url)
    readable_article = Document(html).summary()
    title = Document(html).short_title()
    text = BeautifulSoup(readable_article).get_text()
    return title, text
def readability_extractor(self, html):
    try:
        doc = Document(html)
        content = doc.summary()
        if content and content != "":
            return content
        else:
            return self.html2text_extractor(html)
    except:
        return self.html2text_extractor(html)
def extract_article(self):
    """Returns only readable content

    Returns:
        data - {
            'title': 'Title of the article',
            'content': 'HTML body of the article'
        }
    """
    doc = Document(self._html)
    return {'title': doc.title(), 'content': doc.summary()}
def extract_data(self, patchurl):
    try:
        f = requests.get(patchurl)
        html = f.content
        doc = Document(html)
        title = doc.short_title()
        summary = doc.summary()
        return smart_str(title), smart_str(summary)
    except:
        return None, None
def extract_by_readability(html):
    document = Document(html)

    def strip_html(html):
        return re.sub(r'<[^<]+?>', '', html)

    return {
        'title': ensure_unicode(document.short_title()),
        'body': strip_html(ensure_unicode(document.summary())),
    }
def decode_doc(doc, url):
    #print('doc')
    cs = re.compile(b'^<(meta|META).*charset=("|\')?([^ "\']*)')
    pkey = re.compile(b'^<(meta|META).*keywords.*content=("|\')?([^ "\']*)')
    codec = None
    keywords = None
    #print(*doc)
    for l in doc:
        if (l.startswith(b'<meta') or l.startswith(b'<META')):
            if codec is None and (b'charset' in l):
                m = cs.match(l)
                codec = m.group(3).decode()
            if keywords is None and b'keywords' in l:
                m = pkey.match(l)
                if m:
                    keywords = m.group(3)

    sdoc = []
    for l in doc:
        try:
            l = l.decode(codec)
        except:
            l = ''
        sdoc.append(l)

    try:
        if keywords:
            keywords = keywords.decode(codec)
        else:
            #print(*sdoc, sep='\n')
            keywords = ''
        keywords = re.split(r'[ ,;\|]', keywords)
        #print(keywords.encode('utf8'))
    except:
        pass

    #if sum(len(x) for x in sdoc) < 1000: return
    doc = '\n'.join(sdoc)
    #if len(doc) < 1000: return
    try:
        doc = Document(doc)
        title = doc.short_title()
        content = doc.summary()
    except:
        return
    #print(doc.summary().encode('utf8'))
    #print(doc.short_title().encode('utf8'))

    data = {"url": url, 'keywords': keywords, 'title': title, 'content': content}
    return data
def main():
    html = open('./samples/21853124_0.shtml').read()
    doc = Document(html)
    doc.transform()
    doc.get_publish_date()
    doc.short_title()
    doc.text_content()
def _update(self, response):
    app.logger.debug("Updating %s" % response.url)
    data = Document(response.text).summary()
    doc = lxml.html.fromstring(data)
    images = []
    imageElems = doc.xpath("//img")
    app.logger.debug("%d images for %s", len(imageElems), response.url)
    for img in imageElems:
        src = urlparse.urljoin(response.url, img.get("src"))
        imgResp = requests.get(src)
        encoded = base64.b64encode(imgResp.content)
        if len(encoded) < 3000:
            src = "data:" + imgResp.headers["content-type"] + ";base64," + encoded
        else:
            md5 = hashlib.sha1()
            md5.update(encoded)
            name = md5.hexdigest()
            src = name + "." + src.rpartition(".")[2]
            images.append((src, encoded))
        img.set("src", src)

    data = StringIO()
    data.write(lxml.etree.tostring(doc, pretty_print=True))
    for (name, imageData) in images:
        data.write("\n--data:" + name + "\n" + imageData)
    data.seek(0)
    self.article = data.read()
    self.save()
def get_main_text(html):
    main_text = Document(html).summary()
    main_text = BeautifulSoup(main_text).getText()

    # collapse runs of blank lines
    r = re.compile(r'\n+', re.M | re.S)
    main_text = r.sub('\n', main_text)

    # strip a leading newline
    if main_text.find('\n') == 0:
        main_text = main_text.replace('\n', '', 1)

    return main_text
def parse_news_content(self, response):
    for link in self.full_article_link_extractor.extract_links(response):
        request = response.request.replace(url=link.url)
        yield request

    item = self._create_item(response)
    if item is not None:
        doc = Document(response.body)
        item['title'] = doc.short_title()
        item['content'] = html2text.html2text(doc.summary())
        yield item
def tell_url(un, url):
    buff = urllib2.urlopen(url)
    doc = Document(buff.read())
    html_buff = doc.summary()
    text_buff = extract_text(html_buff)
    class_name = un.tell_buff(text_buff)
    if class_name:
        class_name_human = un.get_class_name_human(class_name)
    else:
        class_name_human = None
    return class_name_human