Example No. 1
 def loadFromWeb(cls,url):   
     html = requests.get(url).content
     readable_article = Document(html).summary()
     readable_title = Document(html).short_title()
     cleantext = BeautifulSoup(readable_article).text
     cleantext = HTMLParser.HTMLParser().unescape(cleantext)
     return cleantext
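
Example No. 1 above is Python 2 era code (HTMLParser.HTMLParser().unescape). A minimal Python 3 sketch of the same pattern, assuming the requests, readability-lxml and beautifulsoup4 packages are available:

import html

import requests
from bs4 import BeautifulSoup
from readability import Document


def load_from_web(url):
    # Fetch the page and let readability isolate the main article.
    raw = requests.get(url).content
    doc = Document(raw)
    readable_article = doc.summary()     # cleaned article HTML
    readable_title = doc.short_title()   # best-guess title
    # Strip the remaining tags and unescape HTML entities.
    text = BeautifulSoup(readable_article, 'html.parser').get_text()
    return readable_title, html.unescape(text)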
Example No. 2
def extract(text):
    soup = BeautifulSoup(text, 'html.parser')  # , from_encoding="utf8")
    aaa = soup.find('li', {'id': 'EntryTag'})
    print aaa
    bbb = soup.find('div', {'id': 'BlogPostCategory'})
    tag_str = ''
    print bbb
    soup1 = soup.find('div', {'id': 'cnblogs_post_body'})
    if soup1:
        try:
            content = str(soup1)
            logging.info('find content in html tag')
        except:
            content = Document(text).summary()
            logging.info('converting soup to string failed, falling back to readability',
                         exc_info=True)
    else:
        content = Document(text).summary()
        logging.info('find content via readability')
    try:
        aaaa = aaa.find_all('a')
        tag_list = [i2.get_text() for i2 in aaaa]
        tag_str = ','.join(tag_list)
        aaab = bbb.find_all('a')
        tag_list2 = [i2.get_text() for i2 in aaab]
        tag_str += ','.join(tag_list2)
    except Exception, e:
        # print Exception, e
        logging.error('cant find keyword in html', exc_info=True)
Example No. 3
def contents_scraping(link, remove_space=True, remove_lb=True):
    """Scraping contents.

    Parameter
    ---------
    url : str
      Scraping target url.

    Return
    ------
    list : 
        title and contents.
    """

    try:
        html = urllib.request.urlopen(link).read()
    except:
        print("ERROR : failed to get contents. -> " + link)
        return (False, "")

    title = Document(html).short_title()
    contents = Document(html).summary()
    contents = html2text.html2text(contents)

    p = re.compile(r"<[^>]*?>")
    c = p.sub("", contents)

    if remove_space is True:
        c = c.replace(" ", "")

    if remove_lb is True:
        c = c.replace("\r", "")
        c = c.replace("\n", "")

    return title, c
Example No. 4
    def getTitleAndContent(self, contentUrl):
        myHeader = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:55.0) Gecko/20100101 Firefox/55.0',
        }
        try:
            r = self.http.request('GET', contentUrl, headers=myHeader)
            # print(r.status)  # 200
            # get the HTML source (decode as UTF-8 if needed)
            # print(r.data.decode())
            html = r.data
            readable_title = Document(html).short_title()
            readable_article = Document(html).summary()
            content = self.ht.handle(readable_article)
            # content = re.sub(r'阅读剩余全文()|该菜谱创建于[\s\S]+任何部分的内容。|(更多相关资讯请关注:|用手机访问|1[\s\d]+\s下一页|\*\s|精美图片)[\s\S]+|(新闻热线:[\s\S]+)#', '', content)

            response = etree.HTML(html)
            # content = response.xpath("string(//div[@class='text-3zQ3cZD4'])")
            # content = re.sub(
            #     r'图集|(\+1\s|【纠错】)[\s\S]+', '',
            #     content).strip()
            # script = response.xpath("//script")[5].text
            # response = re.findall('contentList":([\s\S]+),"currentPage', script)[0]
            # datas = json.loads(response)[0]
            # strData = datas['data']

            # pat = re.compile('<[^>]+>', re.S)
            # content = pat.sub('', strData)
            # content = ''.join(content).replace(u'\u3000', '').replace(u'\xa0','').strip()
            data = dict()
            data["title"] = readable_tilte
            data["content"] = content

            return self.return_data(0, "success", data)
        except Exception as e:
            return self.return_data(1, e)
Example No. 5
def getContent(url):

    print '@@ start crawl %s @@@' % url

    html = getHTml(url)
    '''use readability to extract the main article'''
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()

    a = re.sub(r'<script[\s\S]*?</script>|&#13;', '', readable_article).strip()
    b = re.sub(r'<(?!p|img|/p|br|iframe)[^<>]*?>', '', a).strip()
    c = re.sub(r'<p[^>]*?>', '<p>', b).strip().replace('\n', '')
    d = re.sub(r'<p>\s+<p>', '', c)

    # count the number of Chinese characters
    num = number(b)

    if num > 100:

        #sql = '''INSERT INTO newbaidu_detail_contont VALUES ('%s','%s','%s','%s')''' % (url,readable_title,d,current_date)
        getc = url + '\n' + readable_title + '\n' + d + '\n' + current_date + '\n'

        try:
            with open('news/' + readable_title + '.txt', 'w') as f2:
                f2.write(getc)
            print '执行成功'
        except Exception, e:
            print '执行失败,%s' % e

        return '成功'
Example No. 6
    def get_screen_play(self, url):
        """Download webpage and analyze basic sequence

        :param url:
        :return:
        """
        res = requests.get(url)
        html = res.content.decode('utf-8')
        # Analyze basic sequence
        readable_article = Document(html).summary()
        self.readable_article = readable_article
        readable_title = Document(html).title()
        self.readable_title = readable_title

        base_url = path.dirname(res.request.url)

        result = Extractor(base_url).html_to_asset_list(readable_article)
        #print(result)
        df_screenplay = pd.DataFrame(result, columns=['type', 'content'])
        df_screenplay['local_src'] = df_screenplay['content'].apply(lambda x: self.string2hash(x))
        image_selector = (df_screenplay['type'] == 'image')
        df_screenplay.loc[image_selector, 'filename'] = df_screenplay.loc[
            image_selector, 'content'].apply(lambda x: path.basename(x))
        df_screenplay.loc[image_selector, 'extname'] = df_screenplay.loc[
            image_selector, 'filename'].apply(lambda x: path.splitext(x)[1])
        df_screenplay = df_screenplay.fillna('')
        df_screenplay['download_name'] = df_screenplay['local_src'] + df_screenplay['extname']
        df_screenplay['converted_name'] = df_screenplay['local_src'] + '.png'

        self.df_screenplay = df_screenplay
        return df_screenplay
Example No. 7
def crawl(site, depth, linksfile):
    pattern = re.compile(r'href="(http://.*?)"')
    f = open(linksfile, 'a+')
    try:
        if depth < MAX_DEPTH:
            print 'crawling [%s]...' % site,
            print >> f, '[%s]' % site

            br = mechanize.Browser()
            br.set_handle_robots(False)
            br.addheaders = [('User-agent', 'Firefox')]
            url = br.open(site)
            content = url.read()

            hits = pattern.findall(content)
            for hit in hits:
                print >> f, hit
                url2 = br.open(hit)
                content2 = url2.read()
                readable_article = Document(content2).summary()
                readable_title = Document(content).short_title()
                soup = BeautifulSoup(readable_article)
                final_article = soup.text
                links = soup.findAll('img', src=True)
                print final_article

            print 'done.'
            print >> f, ''

            for hit in hits:

                crawl(hit, depth + 1, linksfile)
    except:
        pass
    f.close()
Example No. 8
def get_main_content(html):
    readable_title = Document(html).short_title()
    readable_article = Document(html).summary()
    text_p = re.sub(r'</?div.*?>', '', readable_article)
    text_p = re.sub(r'((</p>)?<a href=.*?>|</a>(<p>)?)', '', text_p)
    text_p = re.sub(r'<select>.*?</select>', '', text_p)

    return readable_title, text_p
Example No. 9
def getReadability(url):
    #url = 'http://cnn.com/2016/07/17/health/south-africa-meerkat-telescope-galaxies/index.html'
    try:
        html = urllib.urlopen(url).read()
        readable_article = Document(html).summary().replace('\n', '')
        readable_title = Document(html).short_title()
        return readable_title, readable_article
    except Exception, e:
        return '', ''
Example No. 10
def scrape(url,
           pdf_filename,
           pdf_page_size=PDF_PAGE_SIZE,
           folder=OUTPUT_FOLDER,
           clean_it=True,
           css_file=EPUB_CSS,
           lang=EPUB_LANG,
           cover_image=EPUB_COVER,
           isbn=None):
    """Fetch the html content at url and convert it to a pdf file,
    cleaned by readability and framed in an easy-to-read format if
    clean_it is True"""

    raw_html = get_url(url)
    if raw_html is None:
        print "Sorry, could not read ", url
    else:
        filename_prefix, file_ext = os.path.splitext(pdf_filename)
        if clean_it:
            # use readability to get rid of crap
            title = Document(raw_html).short_title()
            content = Document(raw_html).summary(html_partial=True)

            # write the cleaned contents to an html frame for pdf conversion
            frame = HTML_FRAME.substitute(content=to_unicode(content),
                                          url=url,
                                          title=title)

            # unlike pdf, epub is controlled by css, so save the cleaned html alone
            epub_source = write_file(
                folder, os.extsep.join([filename_prefix + '_epub', 'html']),
                to_unicode(content))
            pdf_source = write_file(folder,
                                    os.extsep.join([filename_prefix, 'html']),
                                    frame)

        else:
            title = filename_prefix
            # no readability cleaning requested, so use the fetched html as-is
            epub_source = write_file(
                folder, os.extsep.join([filename_prefix + '_epub', 'html']),
                to_unicode(raw_html))
            pdf_source = write_file(folder,
                                    os.extsep.join([filename_prefix, 'html']),
                                    to_unicode(raw_html))

        if epub_source:
            generate_epub(
                folder, filename_prefix, title,
                os.path.join(
                    folder, os.extsep.join([filename_prefix + '_epub',
                                            'html'])), css_file, cover_image,
                lang, isbn)

        if pdf_source:
            generate_pdf(folder, filename_prefix, pdf_page_size)
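
Example No. 10 relies on Document(raw_html).summary(html_partial=True): with html_partial=True readability returns only the cleaned article fragment rather than a result wrapped in <html>/<body> tags, which is what lets this code drop the content into its own HTML_FRAME template. A quick sketch of the difference, using a synthetic page string:

from readability import Document

article = "<p>" + "Readability keeps blocks of real text like this sentence. " * 4 + "</p>"
page = "<html><body><div id='content'>" + article * 3 + "</div></body></html>"

doc = Document(page)
full_doc = doc.summary()                    # wrapped in <html><body>...</body></html>
fragment = doc.summary(html_partial=True)   # bare fragment, ready to embed elsewhere

print(full_doc[:40])
print(fragment[:40])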
Example No. 11
def fetch_url(url):
    '''
    get url with readability
    '''
    html = basic_fetch_url(url)
    readable_article = Document(html).summary()
    title = Document(html).short_title()
    text = BeautifulSoup(readable_article).get_text()

    return title, text
Example No. 12
    def parse_post(self, response):
        #def parse(self, response):

        dom = PyQuery(response.body)
        res = []
        item = AvnpcPostItem()
        item['title'] = Document(response.body).short_title()
        item['url'] = response.url
        item['content'] = Document(response.body).summary()
        return [item]
Example No. 13
def textualize(path):
    """
    Opens an HTML file on disk and cleans up the tags to get the text
    """
    with codecs.open(path, 'r', 'utf8') as f:
        html = f.read()
        article = Document(html).summary()
        title = Document(html).title()
        soup = BeautifulSoup(article)

        return title, soup.text
Example No. 14
def run1():
    db = MySQLdb.connect(**common.sql_config)
    cursor = db.cursor(MySQLdb.cursors.SSCursor)
    sql_1 = """select id, url, content from news """
    cursor.execute(sql_1)
    print cursor.rowcount
    i = 0
    row = True
    row = cursor.fetchone()

    while row is not None:
        i += 1
        if i % 100 == 0:
            print i, 666666666666666
        row = cursor.fetchmany(size=500)
        # print row
        for row_id, url, content in row:
            # print row_id
            if comb(content, 250) and 'v2ex.com' not in url:
                # print content, 111111111111111111111
                r = common.get_request(url)
                if r.url.startswith('http://mp.weixin.qq.com/'):
                    soup2 = BeautifulSoup(r.text, 'html.parser')
                    title = soup2.find('title').get_text().encode('utf8')
                    content = soup2.find('div',
                                         {'class': 'rich_media_content'})
                    content = unicode(content).encode('utf8')
                else:
                    content = Document(r.text.encode(
                        r.encoding, 'ignore')).summary().encode('utf-8')
                    title = Document(r.text.encode(
                        r.encoding)).short_title().encode('utf-8')
                db2 = MySQLdb.connect(**common.sql_config)
                cursor2 = db2.cursor()
                if not comb(content, 250) and 'mp.weixin.qq.com' in url:
                    sql = """update news set rating = 0, content = '{}' where id = '{}'""".format(
                        db2.escape_string(content), row_id)
                    print 2222222222
                else:
                    sql = """update news set rating = -1, content = '{}' where id = '{}' """.format(
                        db2.escape_string(content), row_id)
                try:
                    cursor2.execute(sql)
                    db2.commit()
                except Exception, e:
                    print e
                    db2.rollback()
                db.ping(True)
                db2.close()
                print row_id, 777777777777777777777
                print url
Example No. 15
def url_matcher(event, url, *args, **kwargs):
    html = requests.get(url).text
    readable_article = Document(html).summary().encode("utf-8")
    readable_article = TAG_RE.sub('', readable_article)
    readable_article = WHITESPACE_RE.sub(' ', readable_article)
    readable_article = readable_article.replace('\n', ' ')
    readable_article = readable_article.replace('&#13;', '')

    if len(readable_article) > 75:
        readable_article = readable_article[:75] + '...'

    readable_title = Document(html).short_title().encode("utf-8")

    return "> " + url + " > " + readable_title + " > " + readable_article
Example No. 16
def cleanHtmlToText(html):
    '''clean html and return list of words to make it brainspeed readable'''
    title = Document(html).short_title()
    html = Document(html).summary()
    soup = BeautifulSoup(html)

    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    text = soup.get_text()

    dicText = {'text': text, 'title': title}

    return dicText
Example No. 17
def get_article(url):
    try:
        html = urllib.urlopen(url).read()
        if html:
            readable_article = Document(html).summary().encode(
                'utf-8', 'ignore')
            readable_title = Document(html).short_title().encode(
                'utf-8', 'ignore')
            return readable_title, readable_article
    except EOFError:
        print 'Error fetching %s: %s' % (url, EOFError)
    except Exception as e:
        print 'Error fetching %s: %s' % (url, e)
    return None, None
Example No. 18
    def url_matcher(self, msg, match):
        url = match.group(0)
        r = requests.head(url)
        max_size = self.config['DOC_MAX_SIZE']
        max_len = self.config['DOC_MAX_LEN']

        # files that are too big cause trouble. Let's just ignore them.
        if 'content-length' in r.headers and \
           int(r.headers['content-length']) > max_size:
            return

        # ignore anything that is not allowed in configuration
        allowed_content_types = self.config['ALLOWED_CONTENT_TYPES']
        content_type = ''
        if 'content-type' in r.headers:
            content_type = re.sub(r'\s*\;.*$', '', r.headers['content-type'])
            content_type = content_type.strip()

        if content_type not in allowed_content_types:
            return

        html = requests.get(url).text
        readable_article = Document(html).summary()
        readable_article = self.text_cleanup(readable_article)

        if len(readable_article) > max_len:
            readable_article = readable_article[:max_len] + '...'

        readable_title = Document(html).title()

        page = MetadataParser(html=html)
        readable_description = page.get_metadata('description')

        if readable_description is None:
            readable_description = ''

        readable_description = self.text_cleanup(readable_description)

        description = ''
        if len(readable_description) > len(readable_article):
            description = readable_description
        else:
            description = readable_article

        if description:
            return "~> {}\n~> {}\n~> {}".format(url, readable_title,
                                                description)
        else:
            return "~> {}\n~> {}".format(url, readable_title)
Example No. 19
def get_text_data(url):
    html = urlopen(url).read()

    from readability.readability import Document
    from bs4 import BeautifulSoup

    readable_article = Document(html).summary()
    readable_title = Document(html).title()
    soup = BeautifulSoup(readable_article)
    url_dict = {}
    url_dict['id'] = 1
    url_dict['text'] = soup.text

    with open(text_file_path, 'w') as json_file:
        json.dump(url_dict, json_file)
Example No. 20
def get_content(text):
    soup = BeautifulSoup(text, 'html.parser')
    article = soup.find(class_='post')
    if article:
        try:
            content = str(article)
            logging.info('find content in html tag')
        except:
            content = Document(text).summary()
            logging.info('converting soup to string failed, falling back to readability',
                         exc_info=True)
    else:
        content = Document(text).summary()
        logging.info('find content via readability')
    return content
Example No. 21
def tos():
    """Render help/terms-of-use page."""
    cleaned_up_content = Document(render_template('help/tos.html')).summary()
    response = dict(template='help/tos.html',
                    content=cleaned_up_content,
                    title='Help: Terms of Use')
    return handle_content_type(response)
Example No. 22
def extract_entry_data(url):
    """
    Fetch the full content for a feed entry url.

    Args:
        | url (str)    -- the url of the entry.

    Returns:
        | entry_data -- Goose object.
        | str        -- the full text, including html.
    """

    html = _get_html(url)

    try:
        # Use Goose to extract data from the raw html,
        # Use readability to give us the html of the main document.

        # Some HTML comes with additional characters prior
        # to the actual document, so we want to strip everything up
        # to the first tag.
        html = html[html.index(b'<'):]

        return g.extract(raw_html=html), Document(html).summary()

    except UnicodeDecodeError as e:
        logger.exception('UnicodeDecodeError with html: {0}'.format(html))
        return None, ''
Example No. 23
    def __init__(self, title=None, link=None, author=None):

        self.title = "None" if title == None else title
        self.link = "None" if link == None else link
        self.author = "None" if author == None else author

        if link == None:
            cleaned = "None"
        else:  # get the content by parsing the link
            try:
                link_connect = urllib2.urlopen(link)
                self.link = clean_link(link_connect)
                html = link_connect.read()
                try:
                    raw = nltk.clean_html(Document(html).summary())
                except:
                    raw = nltk.clean_html(html)
                cleaned = " ".join(re.split(r'[\n\r\t ]+', raw))
                #The following unicode line raises exceptions sometimes.
                #The lack of a fix for now is causing some articles to not have any content
                #cleaned = unicode(cleaned, "utf-8") # TO DO : fix this
                cleaned.replace("&", "")
            except:
                cleaned = "None"

        #print "Length of cleaned HTML",len(cleaned)
        #print cleaned
        self.content = cleaned
        self.updatedAt = ""
Example No. 24
def extract_content():

    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'

    session = db_session(DB)
    M = db_model(DB, 'corpus')

    query = session.query(M)

    while True:
        corpuses = query.filter(M.status == 'ready').order_by(M.id).limit(30).all()
        if not corpuses:
            break

        for corpus in corpuses:
            try:
                summary_html = Document(corpus.html).summary(html_partial=True)
                content = BS(summary_html).text.strip()
                corpus.content = content
                session.commit()
            except:
                corpus.content = '[extract_error]'
                session.commit()
                print('===> extract_content error, id: ', corpus.id)

            corpus.status = 'extracted'
            session.commit()
Example No. 25
 def text(self):
     nonempty_path = self.article_path is not None and self.article_path
     if nonempty_path and os.path.exists(self.article_path):
         with open(self.article_path, 'r') as fio:
             result = fio.read()
     else:
         try:
             resp = requests.get(self.link)
             text = resp.text
             try:
                 result = Document(
                     text,
                     min_text_length=50,
                     positive_keywords=','.join(
                         settings.DATASET_POSITIVE_KEYWORDS),
                     negative_keywords=','.join(
                         settings.DATASET_NEGATIVE_KEYWORDS)).summary()
             except Unparseable:
                 result = text
         except (KeyError, requests.exceptions.RequestException,
                 requests.exceptions.Timeout,
                 requests.exceptions.TooManyRedirects) as e:
             result = ''
         self.article_path = os.path.join(settings.DATASET_ROOT,
                                          '{0}.html'.format(self.id))
         with open(self.article_path, 'w') as fio:
             fio.write(result)
         self.save()
     return result
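
Example No. 25 tunes the extractor through Document's constructor: min_text_length raises the amount of text a block needs before readability will score it, positive_keywords / negative_keywords bias scoring of nodes whose class or id matches those names, and Unparseable is raised when no article can be found. A minimal standalone sketch of the same call, with illustrative keyword values (the settings names used in the example are project-specific):

import requests
from readability import Document
from readability.readability import Unparseable


def fetch_article_html(link):
    text = requests.get(link).text
    try:
        return Document(
            text,
            min_text_length=50,
            positive_keywords=['article', 'content', 'post'],    # illustrative values
            negative_keywords=['sidebar', 'comment', 'footer'],  # illustrative values
        ).summary()
    except Unparseable:
        # Keep the raw page when readability cannot extract an article,
        # mirroring the fallback in the example above.
        return text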
Example No. 26
def reada(url, cache=True):

    if cache:
        cached = memcache.get(key=url)
        if cached is not None:
            return cached

    #file = urllib.urlopen(url)
    #import urllib2
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    file = opener.open(url)
    ##
    enc = 'utf-8'
    text = ''
    try:
        # 1, web html 2 readability
        raw = Document(file.read(), url=url)
        html = raw.summary().encode(enc, 'replace')
        title = raw.short_title()

        # 2, readability 2 markdown, copy from main
        data = html.decode(enc)
        h = html2text.HTML2Text(baseurl=url)
        h.ignore_images = False
        h.body_width = 100000
        text = h.handle(data)
    finally:
        file.close()

    d = {'url': url, 'title': title, 'content': text}
    if cache:
        memcache.add(key=url, value=d, time=600)
    return d
Example No. 27
 def parser_content(self, html, index_url):
     print self.targets
     if self.all_curl_num >= 1000:
         self.targets = []
     if 'list' in index_url:
         return
     readable_article = Document(html).summary()
     push_time = self.parser_html_time(html)
     if not push_time or '2017-04-18' not in push_time:
         return
     try:
         title = re.findall('<h1.*?>(.+?)</h1>', html)[0]
         title = re.sub('<.+?>', '', title)
     except:
         print 'no-h1' + index_url
         return ''
     self.all_curl_num += 1
     content_id = self._content_hash_id(readable_article)
     print content_id
     if not content_id:
         return
     print '*' * 100
     cur = self.conn_status.cursor()
     sql = 'insert into news_status(url, published_time, title, source,content_id) VALUES (%s,%s,%s,%s,%s)'
     sql_arg = (index_url, push_time, title, self.source, content_id)
     print cur.mogrify(sql, sql_arg)
     # raw_input('go on')
     try:
         cur.execute(sql, sql_arg)
         print self.conn_status.commit()
         print index_url
     except Exception, e:
         print e
         self.conn_status.rollback()
Example No. 28
    def _parse_article(self, response):
        feed_entry = response.meta["feed_entry"]

        il = FeedEntryItemLoader(parent=response.meta["il"])
        try:
            response.text
        except AttributeError:
            # Response is not text (e.g. PDF, ...).
            il.add_value("title", feed_entry.get("title"))
            il.add_value("content_html", feed_entry.get("summary"))
            return il.load_item()

        doc = Document(response.text, url=response.url)
        il.add_value("title", doc.short_title() or feed_entry.get("title"))
        summary = feed_entry.get("summary")
        try:
            content = doc.summary(html_partial=True)
            if summary and len(summary) > len(content):
                # Something probably went wrong if the extracted content is shorter than
                # the summary.
                raise Unparseable
        except Unparseable:
            content = summary
        il.add_value("content_html", content)

        return il.load_item()
Example No. 29
 def getArticle(url):
     """ Accepts a url and returns a string 
     containing the article body of the url"""
     r = requests.get(url)
     r_content = r.content
     article = Document(r_content).summary()
     return Actions._cleanText(article)
Example No. 30
def get_webpage_by_html(url, html=None):
    html = get_html_str(url, html)
    summary_obj = predefined_site(url, html)
    article = video_site(url)
    if summary_obj is None:
        doc = Document(html, url=url, debug=True, multipage=False)
        summary_obj = doc.summary_with_metadata(enclose_with_html_tag=False)
    title = summary_obj.short_title
    if article is None:
        article = summary_obj.html
    from urllib.parse import urlparse
    webpage = Webpage()
    webpage.url = url
    webpage.domain = urlparse(url).hostname
    webpage.title = title
    webpage.favicon = ""
    webpage.top_image = None
    webpage.excerpt = summary_obj.description
    webpage.author = None
    webpage.content = article
    webpage.tags = get_suggest_tags(title, article, summary_obj.keywords)
    webpage.movies = []
    webpage.raw_html = html
    webpage.publish_date = None
    webpage.segmentation = get_segmentation(title, article)
    return webpage.__dict__