Example #1
def url_matcher(event, url, *args, **kwargs):
    html = requests.get(url).text
    readable_article = Document(html).summary().encode("utf-8")
    readable_article = TAG_RE.sub('', readable_article)
    readable_article = WHITESPACE_RE.sub(' ', readable_article)
    readable_article = readable_article.replace('\n', ' ')
    readable_article = readable_article.replace('&#13;', '')

    if len(readable_article) > 75:
        readable_article = readable_article[:75] + '...'

    readable_title = Document(html).short_title().encode("utf-8")

    return "> " + url + " > " + readable_title + " > " + readable_article
Example #2
    def getTextFromHTML(self, url_id):
        """ Runs Readability (Document) on the HTML text
        """
        html_row = get_html(self.pg_conn, url_id)

        if not html_row or 'html' not in html_row:
            return False

        if html_row['readabletext'] and html_row['readabletext'] != '':
            return html_row['readabletext']

        html = html_row['html']

        try:
            html_summary = Document(html).summary(html_partial=True)
            html_summary = html_summary.replace('\n', '').replace('\t', '')

            if (len(html_summary) < 150
                    or "Something's wrong here..." in html_summary
                    or "<h1>Not Found</h1><p>The requested URL" in html_summary
                    or html_summary == "<html><head/></html>"
                    or "403 Forbidden" in html_summary):
                return False

            raw_text = lxml.html.document_fromstring(html_summary).text_content()
        except:
            raw_text = False

        if raw_text:
            save_readabletext(self.pg_conn, url_id, raw_text, 'meta')
        else:
            save_readabletext(self.pg_conn, url_id, '', 'meta')

        return raw_text
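get_html and save_readabletext are project-specific database helpers that are not shown. A hedged sketch of what they might look like, assuming a psycopg2 connection and a hypothetical "pages" table with url_id, html, readabletext, and source columns:

def get_html(pg_conn, url_id):
    # Hypothetical helper: fetch the stored HTML and any cached readable text.
    with pg_conn.cursor() as cur:
        cur.execute("SELECT html, readabletext FROM pages WHERE url_id = %s", (url_id,))
        row = cur.fetchone()
    return {'html': row[0], 'readabletext': row[1]} if row else None

def save_readabletext(pg_conn, url_id, text, source):
    # Hypothetical helper: cache the extracted text and record how it was produced.
    with pg_conn.cursor() as cur:
        cur.execute("UPDATE pages SET readabletext = %s, source = %s WHERE url_id = %s",
                    (text, source, url_id))
    pg_conn.commit()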
Example #3
def getText():
    dataList = []
    for f in os.listdir('unsupervised\\documents'):
        filePath = 'unsupervised\\documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)     #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open (filePath, errors='ignore') as myfile:
                source = myfile.read()
                article = Document(source).summary()
                title = Document(source).title()
                soup = BeautifulSoup(article, 'lxml')
                final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
                dataList.append(final)
                #print '*** TITLE *** \n\"' + title + '\"\n'
                #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undetected document type'
            print '' #"-------------------------------"
    return dataList
Example #4
 def fetch_article_contents(self):
     """
     Uses Readability.js + BS4 methods to parse raw html list and
     outputs list of text in an article
     """
     for article in self.raw_html:
         article = Document(article).summary()
         article = BeautifulSoup(article)
         [tag.extract() for tag in article.find_all('img')]
         [tag.extract() for tag in article.find_all('embed')]
         article = article.get_text()
         article = unicode(article)
         article = article.replace('\t', '')
         article = article.replace('\n', ' ')
         self.article_html.append(article)
     return self.article_html
Example #5
    def getTextFromHTML(self, url_id):
        """ Runs Readability (Document) on the HTML text
        """
        html_row = get_html(self.pg_conn, url_id)

        if not html_row or 'html' not in html_row:
            return False

        if html_row['readabletext'] and html_row['readabletext'] != '':
            return html_row['readabletext']

        html = html_row['html']

        try:
            html_summary = Document(html).summary(html_partial=True)
            html_summary = html_summary.replace('\n','').replace('\t','')

            if len(html_summary) < 150 or "Something's wrong here..." in html_summary or "<h1>Not Found</h1><p>The requested URL" in html_summary or html_summary == "<html><head/></html>" or "403 Forbidden" in html_summary:
                return False

            raw_text = lxml.html.document_fromstring(html_summary).text_content()
        except:
            raw_text = False

        if raw_text:
            save_readabletext(self.pg_conn, url_id, raw_text, 'meta')
        else:
            save_readabletext(self.pg_conn, url_id, '', 'meta')

        return raw_text
Example #6
File: run.py  Project: adamstein/mayhem
def main():
    #print 'Hello there'
    # Command line args are in sys.argv[1], sys.argv[2] ...
    # sys.argv[0] is the script name itself and can be ignored

    dataList = []

    for f in os.listdir('documents'):
        filePath = 'documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)     #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            # with open(filePath) as f:
            #     doc = slate.PDF(f)
            #     print doc[1]
            #     exit()


            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open (filePath, errors='ignore') as myfile:
                source = myfile.read()
                article = Document(source).summary()
                title = Document(source).title()
                soup = BeautifulSoup(article, 'lxml')
                final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
                dataList.append(final)
                #print '*** TITLE *** \n\"' + title + '\"\n'
                #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undetected document type'
            print '' #"-------------------------------"

    #print dataList
    #for i in dataList:
    #    print i
    cachedStopWords = stopwords.words("english")
    combined = ' '.join(dataList)

    #print combined
    bloblist = [tb(combined)]

    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, blob, bloblist) for word in blob.words if word not in nltk.corpus.stopwords.words('english')}
        #print scores
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        #print sorted_words
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Example #7
def get_article (url, referrer=None):
    """Fetch the html found at url and use the readability algorithm
    to return just the text content"""

    html = load_url(url, referrer)
    if html is not None:
        doc_html = Document(html).summary(html_partial=True)
        clean_html = doc_html.replace('&amp;', u'&').replace(u'&#13;', u'\n')
        return BeautifulSoup(clean_html).getText(separator=u' ').replace(u'  ', u' ')
Example #8
def get_article(url, referrer=None):
    """Fetch the html found at url and use the readability algorithm
    to return just the text content"""

    html = load_url(url, referrer)
    if html is not None:
        doc_html = Document(html).summary(html_partial=True)
        clean_html = doc_html.replace('&amp;', u'&').replace(u'&#13;', u'\n')
        return BeautifulSoup(clean_html).getText(separator=u' ').replace(
            u'  ', u' ')
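load_url is not defined in examples #7 and #8; a minimal sketch of what such a helper might look like, using requests (an assumption, not the original implementation):

import requests

def load_url(url, referrer=None):
    # Hypothetical fetch helper: return page HTML, optionally sending a Referer header.
    headers = {'Referer': referrer} if referrer else {}
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None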
Example #9
def url_matcher(event, url, *args, **kwargs):
    r = requests.head(url)
    # files that are too big cause trouble. Let's just ignore them.
    if 'content-length' in r.headers and \
       int(r.headers['content-length']) > 5e6:
        return

    html = requests.get(url).text
    readable_article = Document(html).summary().encode("utf-8")
    readable_article = TAG_RE.sub('', readable_article)
    readable_article = WHITESPACE_RE.sub(' ', readable_article)
    readable_article = readable_article.replace('\n', ' ')
    readable_article = readable_article.replace('&#13;', '')

    if len(readable_article) > 75:
        readable_article = readable_article[:75] + '...'

    readable_title = Document(html).short_title().encode("utf-8")

    return "> " + url + " > " + readable_title + " > " + readable_article
Example #10
def get_main_text(html):
    main_text = Document(html).summary()
    main_text = BeautifulSoup(main_text).getText()
    # collapse runs of blank lines
    r = re.compile(r'\n+', re.M | re.S)
    main_text = r.sub('\n', main_text)
    # strip a leading newline
    if main_text.find('\n') == 0:
        main_text = main_text.replace('\n', '', 1)

    return main_text
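The imports get_main_text relies on are not shown; assuming readability-lxml, BeautifulSoup, and re, a short usage sketch (the URL is a placeholder):

import re
import requests
from bs4 import BeautifulSoup
from readability.readability import Document

html = requests.get('https://example.com/article').text  # placeholder URL
print(get_main_text(html))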
Example #11
    def getTextFromHTML(self, html):
        """ Runs Readability (Document) on the HTML text
        """
        try:
            html_summary = Document(html).summary(html_partial=True)
            html_summary = html_summary.replace('\n','').replace('\t','')
            if "Something's wrong here..." in html_summary or "<h1>Not Found</h1><p>The requested URL" in html_summary or html_summary == "<html><head/></html>" or "403 Forbidden" in html_summary:
                return False
            raw_text = lxml.html.document_fromstring(html_summary).text_content()
        except:
            raw_text = False

        return raw_text
Example #12
 def get_content(self, url):
     rt_result = []
     dr = re.compile(r'<[^>]+>', re.S)
     html = urllib.urlopen(url).read()
     cur_title = Document(html).short_title().replace(' ', '')
     readable_article = Document(html).summary()
     print readable_article.encode('utf8')
     readable_article = readable_article.replace('&#13;', '')
     cur_list = readable_article.replace('</p>', '\n').split('\n')
     for item in cur_list:
         if '<img' in item and 'src=' in item:
             #print item.split('src=')[1].split('"')[1]
             dom = soupparser.fromstring(item)
             if len(dom) > 0:
                 img_path = dom[0].xpath('.//img')
                 for img in img_path:
                     rt_result.append(['0', img.get('src')])
         else:
             use_item = dr.sub('', item).replace(' ', '')
             if len(use_item) > 10:
                 rt_result.append(['1', use_item])
     return cur_title, rt_result
Example #13
def extrat_html_document(url):
    try:
        print "extrat_html_document"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = urllib2.Request(url, headers=headers)
        socket = urllib2.urlopen(r, timeout=1)
        url = socket.geturl()
        html = socket.read()

        #block_url pass
        for bl_url in block_url:
            if len(url.split(bl_url)) > 1:
                summary = "block"
                return summary

        for ext_url in exception_url:
            if len(url.split(ext_url)) > 1:
                readable_title = Document(html).short_title()
                summary = readable_title.encode('utf-8')
                _file.write(summary + '\n')
                return summary

        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        summary = readable_title.encode('utf-8') + readable_title.encode(
            'utf-8')
        print "soup start"
        soup = BeautifulSoup(readable_article.replace("br/", "p"),
                             "html.parser")
        print "summary:"

        for s in soup("p"):
            summary += str(s.encode('utf-8'))


#        summary += readable_article.encode('utf-8')

    except Exception:
        _file.write('extrat_html_document Failed URL : ' + url + '\n')
        summary = "Failed Get data"

    return summary
Example #14
def extrat_html_document(url):
    try :
        print "extrat_html_document"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent }
        r = urllib2.Request(url, headers=headers)
        socket = urllib2.urlopen(r,timeout = 1)
        url = socket.geturl()
        html = socket.read()

        #block_url pass
        for bl_url in block_url:
            if len(url.split(bl_url)) > 1:
                summary="block"
                return summary

        for ext_url in exception_url:
            if len(url.split(ext_url)) > 1:
                readable_title = Document(html).short_title()
                summary = readable_title.encode('utf-8')
                _file.write(summary+'\n')
                return summary

        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        summary = readable_title.encode('utf-8') + readable_title.encode('utf-8')
        print "soup start"
        soup = BeautifulSoup(readable_article.replace("br/","p"),"html.parser")
        print "summary:"

        for s in soup("p"):
            summary += str(s.encode('utf-8'))

#        summary += readable_article.encode('utf-8')


    except Exception:
        _file.write('extrat_html_document Failed URL : ' + url + '\n')
        summary = "Failed Get data"

    return summary
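block_url, exception_url, and _file are module-level globals assumed by examples #13 and #14 but not shown; a hedged sketch of what they might look like (the domain names and file name are placeholders):

# Hypothetical module-level setup assumed by the two snippets above.
block_url = ['blocked.example.com']          # domains to skip entirely
exception_url = ['title-only.example.com']   # domains where only the title is kept
_file = open('extract_log.txt', 'a')         # simple log of titles and failed URLs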
Example #15
def parser_content(url):
    rt_result = []
    dr = re.compile(r'<[^>]+>',re.S)
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary().encode('utf8')
    #print readable_article
    readable_article = readable_article.replace('&#13;','')
    cur_list = readable_article.split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            #print item.split('src=')[1].split('"')[1]
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    rt_result.append(['0',img.get('src')])
        else:
            use_item = dr.sub('',item).replace(' ','')
            if len(use_item) > 10:
                rt_result.append(['1',use_item])
    return rt_result
Example #16
    def parse_item(self, response):
        sel = Selector(response)
        try:
            print(response.url)
            #region title
            if sel.css("h1#title::text").extract_first().strip():
                title = sel.css("h1#title::text").extract_first().strip()
            elif sel.xpath("//title/text()").extract_first().strip():
                title = sel.xpath("//title/text()").extract_first().strip()
            else:
                title = ""
            #endregion
            #region publish_data
            if sel.css("span#pubtime::text").re_first(r"\d{4}年\d{2}月\d{2}日"):
                publish_data = sel.css("span#pubtime::text").re_first(
                    r"\d{4}年\d{2}月\d{2}日")
            else:
                publish_data = ""
            # endregion
            #region reference
            if sel.css("span#pubtime::text").re_first(r"来源:(.*)"):
                reference = sel.css("span#pubtime::text").re_first(r"来源:(.*)")
            else:
                reference = ""
            #endregion
            #region keywords
            if sel.xpath(
                    "//div[@class='zuoyou0']/div[5]/font/text()").extract():
                keywords = ",".join(
                    sel.xpath("//div[@class='zuoyou0']/div[5]/font/text()").
                    extract())
            elif sel.xpath(
                    "//div[@class='zuoyou0']/div[4]/font/text()").extract():
                keywords = ",".join(
                    sel.xpath("//div[@class='zuoyou0']/div[4]/font/text()").
                    extract())
            elif sel.xpath('//meta[@name="keywords"]/@content').extract_first():
                keywords = ",".join(
                    sel.xpath('//meta[@name="keywords"]/@content').extract())
            else:
                keywords = ''
            #endregion
            #region html_content
            if sel.xpath("//div[@class='duiqi']/p/font/text()").extract():
                html_content = "".join(
                    sel.xpath("//div[@class='duiqi'][2]/p").extract())
            else:
                html_content = Document(response.text).summary()
                html_content = html_content.replace('<html><body>', '')
                html_content = html_content.replace('</body></html>', '')
            content = "".join(
                Selector(text=html_content).css("::text").extract())

            #endregion
            #region img_url
            if Selector(text=html_content).css("img::attr(src)").extract():
                img = Selector(text=html_content).css("img::attr(src)").extract_first()
            else:
                img = ''

            #endregion

            # region item
            i = ItemLoader(item=NewsItem(), response=response)
            if (title):
                i.add_value(field_name='title', value=title)
                i.add_value(field_name='publish_date', value=publish_data)
                i.add_value(field_name="reference", value=reference)
                # i.add_value(field_name="author", value=authon)
                i.add_value(field_name="keywords", value=keywords)
                i.add_value(field_name="html_content", value=html_content)
                i.add_value(field_name="image_url", value=img)
                # complete the code
                # base class

                yield i.load_item()
            # endregion
        except Exception as e:
            print(e.args)
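NewsItem is a project-specific Scrapy Item that is not shown; a minimal sketch consistent with the fields loaded in parse_item above (an assumption, not the original definition):

import scrapy

class NewsItem(scrapy.Item):
    # Fields referenced by the ItemLoader calls in parse_item.
    title = scrapy.Field()
    publish_date = scrapy.Field()
    reference = scrapy.Field()
    keywords = scrapy.Field()
    html_content = scrapy.Field()
    image_url = scrapy.Field()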
Example #17
def get_cleaned_html_from_url(url):
    readable_article = Document(get_html(url)).summary()
    readable_article = readable_article.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c","\"").replace(u"\u201d", "\"")
    string_out = "<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" /></head>"
    string_out += readable_article[6:]
    return string_out
Example #18
def write_readable_text_from_url(url,out_file):
    readable_article = Document(get_html(url)).summary()
    readable_article = readable_article.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c","\"").replace(u"\u201d", "\"")
    out_file.write("<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" /></head>")
    out_file.write(readable_article[6:])
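get_html in examples #17 and #18 is an assumed fetch helper (distinct from the database lookup of the same name in example #2); a sketch under the assumption that it wraps requests:

import requests

def get_html(url):
    # Hypothetical helper: download the page and return its HTML text.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.text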
Example #19
def update(offset):
    offset = int(offset)
    if offset == 0:
        db.session.query(Entry).delete()
        db.session.commit()
        return ''

    # Obtain bearer token from Twitter
    url = "https://api.twitter.com/oauth2/token"
    consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
    consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
    auth = base64.b64encode(consumer_key + ':' + consumer_secret)
    request = urllib2.Request(url, "grant_type=client_credentials", {"Authorization": "Basic "+auth})
    response = urllib2.urlopen(request).read()
    json_response = json.loads(response)
    access_token = json_response['access_token']

    # Obtain HN posts >100 pts
    url = "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=newsyc100&count=40"
    request = urllib2.Request(url, headers={"Authorization": "Bearer "+access_token})
    response = urllib2.urlopen(request).read()
    tweets = json.loads(response)

    increment = 2

    start_at = (offset - 1) * increment
    tweets = tweets[start_at:start_at + increment]
    for tweet in tweets:
        title = tweet['text']

        start_link = title.rfind("(http")
        end_link = title.find(")", start_link)
        comment_link = title[start_link+1:end_link]

        title = title[0:start_link]

        start_link = title.rfind("http")
        end_link = title.find(" ", start_link)
        link = title[start_link:end_link]

        title = title[0:start_link]

        try:
            response = urllib2.urlopen(link)
        except urllib2.HTTPError:
            continue

        encoding = response.headers['content-type'].split('charset=')[-1]
        if encoding == 'text/html':
            encoding = 'utf-8'
        if encoding == 'application/pdf':
            continue
        html = response.read().decode(encoding, 'ignore')

        if sys.modules.has_key('readability.readability'):
            body = Document(html).summary()
        else:
            body = html

        body = body.replace('<html><body>', '<html><body><a href="' + comment_link + '">HN Comments</a><br>')
        body = body.replace('<body id="readabilityBody">', '')

        entry = Entry(link, title, body)
        db.session.add(entry)

    db.session.commit()
    return ''
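Entry and db come from a Flask-SQLAlchemy setup that is not shown; a minimal sketch consistent with the Entry(link, title, body) call above (column types and names beyond those three are assumptions):

from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class Entry(db.Model):
    # Minimal model assumed by the update() view above.
    id = db.Column(db.Integer, primary_key=True)
    link = db.Column(db.Text)
    title = db.Column(db.Text)
    body = db.Column(db.Text)

    def __init__(self, link, title, body):
        self.link = link
        self.title = title
        self.body = body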
Example #20
import requests
from readability.readability import Document

url_in = "" # put url here

r = requests.get(url_in)
# print(r.status_code)

html = r.text

with open('out.html','w') as out_file:
    readable_article = Document(html).summary()
    readable_article = readable_article.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c","\"").replace(u"\u201d", "\"")
    out_file.write("<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" /></head>")
    out_file.write(readable_article[6:])
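A note on the slice in examples #17, #18, and #20: readable_article[6:] drops the leading "<html>" that Document(...).summary() appears to prepend to its output, so the hand-written doctype and head element written just before it take its place in the saved file.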