def loadFromWeb(cls, url):
    html = requests.get(url).content
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()
    cleantext = BeautifulSoup(readable_article).text
    cleantext = HTMLParser.HTMLParser().unescape(cleantext)
    return cleantext

def extract(text):
    soup = BeautifulSoup(text, 'html.parser')  # , from_encoding="utf8")
    aaa = soup.find('li', {'id': 'EntryTag'})
    print aaa
    bbb = soup.find('div', {'id': 'BlogPostCategory'})
    tag_str = ''
    print bbb
    soup1 = soup.find('div', {'id': 'cnblogs_post_body'})
    if soup1:
        try:
            content = str(soup1)
            logging.info('find content in html tag')
        except:
            content = Document(text).summary()
            logging.info('converting soup to string failed, falling back to readability', exc_info=True)
    else:
        content = Document(text).summary()
        logging.info('find content via readability')
    try:
        aaaa = aaa.find_all('a')
        # get_text must be called, otherwise the list holds bound methods instead of strings
        tag_list = [i2.get_text() for i2 in aaaa]
        tag_str = ','.join(tag_list)
        aaab = bbb.find_all('a')
        tag_list2 = [i2.get_text() for i2 in aaab]
        tag_str += ','.join(tag_list2)
    except Exception, e:
        # print Exception, e
        logging.error('cant find keyword in html', exc_info=True)

def contents_scraping(link, remove_space=True, remove_lb=True):
    """Scrape the main contents of a page.

    Parameters
    ----------
    link : str
        Scraping target url.

    Returns
    -------
    tuple : (title, contents), or (False, "") if the page could not be fetched.
    """
    try:
        html = urllib.request.urlopen(link).read()
    except:
        print("ERROR : failed to get contents. -> " + link)
        return (False, "")
    title = Document(html).short_title()
    contents = Document(html).summary()
    contents = html2text.html2text(contents)
    p = re.compile(r"<[^>]*?>")
    c = p.sub("", contents)
    if remove_space is True:
        c = c.replace(" ", "")
    if remove_lb is True:
        c = c.replace("\r", "")
        c = c.replace("\n", "")
    return title, c

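# Hedged usage sketch (not in the original source): contents_scraping() above signals a
# failed fetch with (False, "") and otherwise returns (title, text), so callers only need
# to test the first element. The helper below is a hypothetical demonstration and the URL
# is a placeholder.
def _demo_contents_scraping():
    title, body = contents_scraping("https://example.com/post")
    if title is False:
        print("could not scrape the page")
    else:
        print(title)
        print(body[:200])
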
def getTitleAndContent(self, contentUrl):
    myHeader = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:55.0) Gecko/20100101 Firefox/55.0',
    }
    try:
        r = self.http.request('GET', contentUrl, headers=myHeader)
        # print(r.status)  # 200
        # Get the html source and decode it as utf-8
        # print(r.data.decode())
        html = r.data
        readable_title = Document(html).short_title()
        readable_article = Document(html).summary()
        content = self.ht.handle(readable_article)
        # content = re.sub(r'阅读剩余全文()|该菜谱创建于[\s\S]+任何部分的内容。|(更多相关资讯请关注:|用手机访问|1[\s\d]+\s下一页|\*\s|精美图片)[\s\S]+|(新闻热线:[\s\S]+)#', '', content)
        response = etree.HTML(html)
        # content = response.xpath("string(//div[@class='text-3zQ3cZD4'])")
        # content = re.sub(
        #     r'图集|(\+1\s|【纠错】)[\s\S]+', '',
        #     content).strip()
        # script = response.xpath("//script")[5].text
        # response = re.findall('contentList":([\s\S]+),"currentPage', script)[0]
        # datas = json.loads(response)[0]
        # strData = datas['data']
        # pat = re.compile('<[^>]+>', re.S)
        # content = pat.sub('', strData)
        # content = ''.join(content).replace(u'\u3000', '').replace(u'\xa0', '').strip()
        data = dict()
        data["title"] = readable_title
        data["content"] = content
        return self.return_data(0, "success", data)
    except Exception as e:
        return self.return_data(1, e)

def getContent(url):
    print '@@ start crawl %s @@@' % url
    html = getHTml(url)
    '''use readability to extract the main content'''
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()
    a = re.sub(r'<script[\s\S]*?</script>| ', '', readable_article).strip()
    b = re.sub(r'<(?!p|img|/p|br|iframe)[^<>]*?>', '', a).strip()
    c = re.sub(r'<p[^>]*?>', '<p>', b).strip().replace('\n', '')
    d = re.sub(r'<p>\s+<p>', '', c)
    # count the number of Chinese characters
    num = number(b)
    if num > 100:
        #sql = '''INSERT INTO newbaidu_detail_contont VALUES ('%s','%s','%s','%s')''' % (url,readable_title,d,current_date)
        getc = url + '\n' + readable_title + '\n' + d + '\n' + current_date + '\n'
        try:
            with open('news/' + readable_title + '.txt', 'w') as f2:
                f2.write(getc)
            print 'write succeeded'
        except Exception, e:
            print 'write failed, %s' % e
    return 'success'

def get_screen_play(self, url):
    """Download webpage and analyze basic sequence

    :param url:
    :return:
    """
    res = requests.get(url)
    html = res.content.decode('utf-8')

    # Analyze basic sequence
    readable_article = Document(html).summary()
    self.readable_article = readable_article
    readable_title = Document(html).title()
    self.readable_title = readable_title

    base_url = path.dirname(res.request.url)
    result = Extractor(base_url).html_to_asset_list(readable_article)
    #print(result)

    df_screenplay = pd.DataFrame(result, columns=['type', 'content'])
    df_screenplay['local_src'] = df_screenplay['content'].apply(lambda x: self.string2hash(x))

    image_selector = (df_screenplay['type'] == 'image')
    df_screenplay.loc[image_selector, 'filename'] = df_screenplay.loc[
        image_selector, 'content'].apply(lambda x: path.basename(x))
    df_screenplay.loc[image_selector, 'extname'] = df_screenplay.loc[
        image_selector, 'filename'].apply(lambda x: path.splitext(x)[1])
    df_screenplay = df_screenplay.fillna('')
    df_screenplay['download_name'] = df_screenplay['local_src'] + df_screenplay['extname']
    df_screenplay['converted_name'] = df_screenplay['local_src'] + '.png'

    self.df_screenplay = df_screenplay
    return df_screenplay

def crawl(site, depth, linksfile):
    pattern = re.compile(r'href="(http://.*?)"')
    f = open(linksfile, 'a+')
    try:
        if depth < MAX_DEPTH:
            print 'crawling [%s]...' % site,
            print >> f, '[%s]' % site
            br = mechanize.Browser()
            br.set_handle_robots(False)
            br.addheaders = [('User-agent', 'Firefox')]
            url = br.open(site)
            content = url.read()
            hits = pattern.findall(content)
            for hit in hits:
                print >> f, hit
                url2 = br.open(hit)
                # read the linked page, not the already-consumed parent response
                content2 = url2.read()
                readable_article = Document(content2).summary()
                readable_title = Document(content).short_title()
                soup = BeautifulSoup(readable_article)
                final_article = soup.text
                links = soup.findAll('img', src=True)
                print final_article
            print 'done.'
            print >> f, ''
            for hit in hits:
                crawl(hit, depth + 1, linksfile)
    except:
        pass
    f.close()

def get_main_content(html):
    readable_title = Document(html).short_title()
    readable_article = Document(html).summary()
    text_p = re.sub(r'</?div.*?>', '', readable_article)
    text_p = re.sub(r'((</p>)?<a href=.*?>|</a>(<p>)?)', '', text_p)
    text_p = re.sub(r'<select>.*?</select>', '', text_p)
    return readable_title, text_p

def getReadability(url):
    #url = 'http://cnn.com/2016/07/17/health/south-africa-meerkat-telescope-galaxies/index.html'
    try:
        html = urllib.urlopen(url).read()
        readable_article = Document(html).summary().replace('\n', '')
        readable_title = Document(html).short_title()
        return readable_title, readable_article
    except Exception, e:
        return '', ''

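# Hedged usage sketch (not in the original source): getReadability() above swallows all
# errors and returns two empty strings, so callers can rely on a simple truthiness check.
# The helper below is a hypothetical demonstration with a placeholder URL (Python 2 style,
# matching the snippet).
def _demo_getReadability():
    title, article_html = getReadability('http://example.com/news/item')
    if title:
        print title
    else:
        print 'extraction failed or page was empty'
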
def scrape(url, pdf_filename, pdf_page_size=PDF_PAGE_SIZE, folder=OUTPUT_FOLDER,
           clean_it=True, css_file=EPUB_CSS, lang=EPUB_LANG, cover_image=EPUB_COVER,
           isbn=None):
    """Fetch the html content at url and convert it to a pdf file, cleaned by
    readability and framed in an easy-to-read format if clean_it is True"""
    raw_html = get_url(url)
    if raw_html is None:
        print "Sorry, could not read ", url
    else:
        filename_prefix, file_ext = os.path.splitext(pdf_filename)
        if clean_it:
            # use readability to get rid of crap
            title = Document(raw_html).short_title()
            content = Document(raw_html).summary(html_partial=True)
            # write the cleaned contents to an html frame for pdf conversion
            frame = HTML_FRAME.substitute(content=to_unicode(content), url=url, title=title)
            # unlike pdf, epub is controlled by css, so save the cleaned html alone
            epub_source = write_file(
                folder, os.extsep.join([filename_prefix + '_epub', 'html']),
                to_unicode(content))
            pdf_source = write_file(folder, os.extsep.join([filename_prefix, 'html']), frame)
        else:
            title = filename_prefix
            # no readability cleaning requested, so use the fetched html as-is
            epub_source = write_file(
                folder, os.extsep.join([filename_prefix + '_epub', 'html']),
                to_unicode(raw_html))
            pdf_source = write_file(folder, os.extsep.join([filename_prefix, 'html']),
                                    to_unicode(raw_html))
        if epub_source:
            generate_epub(
                folder, filename_prefix, title,
                os.path.join(folder, os.extsep.join([filename_prefix + '_epub', 'html'])),
                css_file, cover_image, lang, isbn)
        if pdf_source:
            generate_pdf(folder, filename_prefix, pdf_page_size)

def fetch_url(url):
    ''' get url with readability '''
    html = basic_fetch_url(url)
    readable_article = Document(html).summary()
    title = Document(html).short_title()
    text = BeautifulSoup(readable_article).get_text()
    return title, text

def parse_post(self, response):
    #def parse(self, response):
    dom = PyQuery(response.body)
    res = []
    item = AvnpcPostItem()
    # use short_title() for the title field; summary() would duplicate the content field
    item['title'] = Document(response.body).short_title()
    item['url'] = response.url
    item['content'] = Document(response.body).summary()
    return [item]

def textualize(path):
    """
    Opens an HTML file on disk and cleans up the tags to get the text
    """
    with codecs.open(path, 'r', 'utf8') as f:
        html = f.read()
        article = Document(html).summary()
        title = Document(html).title()
        soup = BeautifulSoup(article)
        return title, soup.text

def run1():
    db = MySQLdb.connect(**common.sql_config)
    cursor = db.cursor(MySQLdb.cursors.SSCursor)
    sql_1 = """select id, url, content from news """
    cursor.execute(sql_1)
    print cursor.rowcount
    i = 0
    row = True
    # prime the loop with a first fetch
    row = cursor.fetchone()
    # fetchmany() returns an empty sequence when the cursor is exhausted,
    # so test truthiness rather than "is not None" to avoid looping forever
    while row:
        i += 1
        if i % 100 == 0:
            print i, 666666666666666
        row = cursor.fetchmany(size=500)
        # print row
        for row_id, url, content in row:
            # print row_id
            if comb(content, 250) and 'v2ex.com' not in url:
                # print content, 111111111111111111111
                r = common.get_request(url)
                if r.url.startswith('http://mp.weixin.qq.com/'):
                    soup2 = BeautifulSoup(r.text, 'html.parser')
                    title = soup2.find('title').get_text().encode('utf8')
                    content = soup2.find('div', {'class': 'rich_media_content'})
                    content = unicode(content).encode('utf8')
                else:
                    content = Document(r.text.encode(
                        r.encoding, 'ignore')).summary().encode('utf-8')
                    title = Document(r.text.encode(
                        r.encoding)).short_title().encode('utf-8')
                db2 = MySQLdb.connect(**common.sql_config)
                cursor2 = db2.cursor()
                if not comb(content, 250) and 'mp.weixin.qq.com' in url:
                    sql = """update news set rating = 0, content = '{}' where id = '{}'""".format(
                        db2.escape_string(content), row_id)
                    print 2222222222
                else:
                    sql = """update news set rating = -1, content = '{}' where id = '{}' """.format(
                        db2.escape_string(content), row_id)
                try:
                    cursor2.execute(sql)
                    db2.commit()
                except Exception, e:
                    print e
                    db2.rollback()
                db.ping(True)
                db2.close()
                print row_id, 777777777777777777777
                print url

def url_matcher(event, url, *args, **kwargs):
    html = requests.get(url).text
    readable_article = Document(html).summary().encode("utf-8")
    readable_article = TAG_RE.sub('', readable_article)
    readable_article = WHITESPACE_RE.sub(' ', readable_article)
    readable_article = readable_article.replace('\n', ' ')
    readable_article = readable_article.replace(' ', '')
    if len(readable_article) > 75:
        readable_article = readable_article[:75] + '...'
    readable_title = Document(html).short_title().encode("utf-8")
    return "> " + url + " > " + readable_title + " > " + readable_article

def cleanHtmlToText(html):
    '''clean html and return list of words to make it brainspeed readable'''
    title = Document(html).short_title()
    html = Document(html).summary()
    soup = BeautifulSoup(html)
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
    text = soup.get_text()
    dicText = {'text': text, 'title': title}
    return dicText

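# Hedged usage sketch (not in the original source): cleanHtmlToText() above returns a dict
# with 'title' and 'text' keys, with <script> and <style> blocks already stripped out. The
# helper below is a hypothetical demonstration; some_html stands in for any HTML string.
def _demo_cleanHtmlToText(some_html):
    result = cleanHtmlToText(some_html)
    print(result['title'])
    print(result['text'][:200])
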
def get_article(url):
    try:
        html = urllib.urlopen(url).read()
        if html:
            readable_article = Document(html).summary().encode('utf-8', 'ignore')
            readable_title = Document(html).short_title().encode('utf-8', 'ignore')
            return readable_title, readable_article
    except EOFError:
        print 'Error fetching %s: %s' % (url, EOFError)
    except Exception as e:
        print 'Error fetching %s: %s' % (url, e)
    return None, None

def url_matcher(self, msg, match):
    url = match.group(0)
    r = requests.head(url)
    max_size = self.config['DOC_MAX_SIZE']
    max_len = self.config['DOC_MAX_LEN']

    # files that are too big cause trouble. Let's just ignore them.
    if 'content-length' in r.headers and \
            int(r.headers['content-length']) > max_size:
        return

    # ignore anything that is not allowed in configuration
    allowed_content_types = self.config['ALLOWED_CONTENT_TYPES']
    content_type = ''
    if 'content-type' in r.headers:
        content_type = re.sub(r'\s*\;.*$', '', r.headers['content-type'])
        content_type = content_type.strip()
    if content_type not in allowed_content_types:
        return

    html = requests.get(url).text
    readable_article = Document(html).summary()
    readable_article = self.text_cleanup(readable_article)
    if len(readable_article) > max_len:
        readable_article = readable_article[:max_len] + '...'
    readable_title = Document(html).title()

    page = MetadataParser(html=html)
    readable_description = page.get_metadata('description')
    if readable_description is None:
        readable_description = ''
    readable_description = self.text_cleanup(readable_description)

    description = ''
    if len(readable_description) > len(readable_article):
        description = readable_description
    else:
        description = readable_article

    if description:
        return "~> {}\n~> {}\n~> {}".format(url, readable_title, description)
    else:
        return "~> {}\n~> {}".format(url, readable_title)

def get_text_data(url):
    html = urlopen(url).read()
    from readability.readability import Document
    from bs4 import BeautifulSoup
    readable_article = Document(html).summary()
    readable_title = Document(html).title()
    soup = BeautifulSoup(readable_article)
    url_dict = {}
    url_dict['id'] = 1
    url_dict['text'] = soup.text
    with open(text_file_path, 'w') as json_file:
        json.dump(url_dict, json_file)

def get_content(text):
    soup = BeautifulSoup(text, 'html.parser')
    article = soup.find(class_='post')
    if article:
        try:
            content = str(article)
            logging.info('find content in html tag')
        except:
            content = Document(text).summary()
            logging.info('converting soup to string failed, falling back to readability', exc_info=True)
    else:
        content = Document(text).summary()
        logging.info('find content via readability')
    return content

def tos():
    """Render help/terms-of-use page."""
    cleaned_up_content = Document(render_template('help/tos.html')).summary()
    response = dict(template='help/tos.html',
                    content=cleaned_up_content,
                    title='Help: Terms of Use')
    return handle_content_type(response)

def extract_entry_data(url):
    """
    Fetch the full content for a feed entry url.

    Args:
        | url (str)    -- the url of the entry.

    Returns:
        | entry_data   -- Goose object.
        | str          -- the full text, including html.
    """
    html = _get_html(url)

    try:
        # Use Goose to extract data from the raw html,
        # Use readability to give us the html of the main document.

        # Some HTML comes with additional characters prior
        # to the actual document, so we want to strip everything up
        # to the first tag.
        html = html[html.index(b'<'):]
        return g.extract(raw_html=html), Document(html).summary()
    except UnicodeDecodeError as e:
        logger.exception('UnicodeDecodeError with html: {0}'.format(html))
        return None, ''

def __init__(self, title=None, link=None, author=None):
    self.title = "None" if title == None else title
    self.link = "None" if link == None else link
    self.author = "None" if author == None else author
    if link == None:
        cleaned = "None"
    else:
        # get the content by parsing the link
        try:
            link_connect = urllib2.urlopen(link)
            self.link = clean_link(link_connect)
            html = link_connect.read()
            try:
                raw = nltk.clean_html(Document(html).summary())
            except:
                raw = nltk.clean_html(html)
            cleaned = " ".join(re.split(r'[\n\r\t ]+', raw))
            #The following unicode line raises exceptions sometimes.
            #The lack of a fix for now is causing some articles to not have any content
            #cleaned = unicode(cleaned, "utf-8")  # TO DO : fix this
            # strings are immutable, so the result of replace() must be reassigned
            cleaned = cleaned.replace("&", "")
        except:
            cleaned = "None"
    #print "Length of cleaned HTML", len(cleaned)
    #print cleaned
    self.content = cleaned
    self.updatedAt = ""

def extract_content():
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'
    session = db_session(DB)
    M = db_model(DB, 'corpus')
    query = session.query(M)
    while True:
        corpuses = query.filter(M.status == 'ready').order_by(M.id).limit(30).all()
        if not corpuses:
            break
        for corpus in corpuses:
            try:
                summary_html = Document(corpus.html).summary(html_partial=True)
                content = BS(summary_html).text.strip()
                corpus.content = content
                session.commit()
            except:
                corpus.content = '[extract_error]'
                session.commit()
                print('===> extract_content error, id: ', corpus.id)
            corpus.status = 'extracted'
            session.commit()

def text(self):
    nonempty_path = self.article_path is not None and self.article_path
    if nonempty_path and os.path.exists(self.article_path):
        with open(self.article_path, 'r') as fio:
            result = fio.read()
    else:
        try:
            resp = requests.get(self.link)
            text = resp.text
            try:
                result = Document(
                    text,
                    min_text_length=50,
                    positive_keywords=','.join(
                        settings.DATASET_POSITIVE_KEYWORDS),
                    negative_keywords=','.join(
                        settings.DATASET_NEGATIVE_KEYWORDS)).summary()
            except Unparseable:
                result = text
        except (KeyError, requests.exceptions.RequestException,
                requests.exceptions.Timeout,
                requests.exceptions.TooManyRedirects) as e:
            result = ''
        self.article_path = os.path.join(settings.DATASET_ROOT,
                                         '{0}.html'.format(self.id))
        with open(self.article_path, 'w') as fio:
            fio.write(result)
        self.save()
    return result

def reada(url, cache=True):
    if cache:
        cached = memcache.get(key=url)
        if cached is not None:
            return cached

    #file = urllib.urlopen(url)
    #import urllib2
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    file = opener.open(url)

    ##
    enc = 'utf-8'
    text = ''
    try:
        # 1, web html 2 readability
        raw = Document(file.read(), url=url)
        html = raw.summary().encode(enc, 'replace')
        title = raw.short_title()
        # 2, readability 2 markdown, copy from main
        data = html.decode(enc)
        h = html2text.HTML2Text(baseurl=url)
        h.ignore_images = False
        h.body_width = 100000
        text = h.handle(data)
    finally:
        file.close()

    d = {'url': url, 'title': title, 'content': text}
    if cache:
        memcache.add(key=url, value=d, time=600)
    return d

def parser_content(self, html, index_url):
    print self.targets
    if self.all_curl_num >= 1000:
        self.targets = []
    if 'list' in index_url:
        return
    readable_article = Document(html).summary()
    push_time = self.parser_html_time(html)
    if not push_time or '2017-04-18' not in push_time:
        return
    try:
        title = re.findall('<h1.*?>(.+?)</h1>', html)[0]
        title = re.sub('<.+?>', '', title)
    except:
        print 'no-h1' + index_url
        return ''
    self.all_curl_num += 1
    content_id = self._content_hash_id(readable_article)
    print content_id
    if not content_id:
        return
    print '*' * 100
    cur = self.conn_status.cursor()
    sql = 'insert into news_status(url, published_time, title, source,content_id) VALUES (%s,%s,%s,%s,%s)'
    sql_arg = (index_url, push_time, title, self.source, content_id)
    print cur.mogrify(sql, sql_arg)
    # raw_input('go on')
    try:
        cur.execute(sql, sql_arg)
        print self.conn_status.commit()
        print index_url
    except Exception, e:
        print e
        self.conn_status.rollback()

def _parse_article(self, response):
    feed_entry = response.meta["feed_entry"]

    il = FeedEntryItemLoader(parent=response.meta["il"])
    try:
        response.text
    except AttributeError:
        # Response is not text (e.g. PDF, ...).
        il.add_value("title", feed_entry.get("title"))
        il.add_value("content_html", feed_entry.get("summary"))
        return il.load_item()

    doc = Document(response.text, url=response.url)
    il.add_value("title", doc.short_title() or feed_entry.get("title"))
    summary = feed_entry.get("summary")
    try:
        content = doc.summary(html_partial=True)
        if summary and len(summary) > len(content):
            # Something probably went wrong if the extracted content is shorter than
            # the summary.
            raise Unparseable
    except Unparseable:
        content = summary
    il.add_value("content_html", content)

    return il.load_item()

def getArticle(url):
    """ Accepts a url and returns a string containing the article body of the url"""
    r = requests.get(url)
    r_content = r.content
    article = Document(r_content).summary()
    return Actions._cleanText(article)

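# Hedged usage sketch (not in the original source): getArticle() above only needs a URL and
# returns the readability summary passed through Actions._cleanText(). The helper below is a
# hypothetical demonstration with a placeholder URL.
def _demo_getArticle():
    body = getArticle('https://example.com/story')
    print(body[:300])
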
def get_webpage_by_html(url, html=None):
    html = get_html_str(url, html)
    summary_obj = predefined_site(url, html)
    article = video_site(url)
    if summary_obj is None:
        doc = Document(html, url=url, debug=True, multipage=False)
        summary_obj = doc.summary_with_metadata(enclose_with_html_tag=False)
    title = summary_obj.short_title
    if article is None:
        article = summary_obj.html

    from urllib.parse import urlparse
    webpage = Webpage()
    webpage.url = url
    webpage.domain = urlparse(url).hostname
    webpage.title = title
    webpage.favicon = ""
    webpage.top_image = None
    webpage.excerpt = summary_obj.description
    webpage.author = None
    webpage.content = article
    webpage.tags = get_suggest_tags(title, article, summary_obj.keywords)
    webpage.movies = []
    webpage.raw_html = html
    webpage.publish_date = None
    webpage.segmentation = get_segmentation(title, article)
    return webpage.__dict__

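# Hedged usage sketch (not in the original source): get_webpage_by_html() above returns the
# Webpage object's attribute dict, so callers index it by field name. The helper below is a
# hypothetical demonstration with a placeholder URL.
def _demo_get_webpage_by_html():
    page = get_webpage_by_html('https://example.com/article')
    print(page['title'])
    print(page['excerpt'])
    print(len(page['content']))
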