Пример #1
0
def find_full_text(html_source):
    """Return the main content of an HTML page as plain text.

    The page is summarised with ``Doc(...).summary()``; the resulting markup
    is then passed through ``strip_tags`` and ``to_plain_text``.
    """
    article_markup = Doc(html_source).summary()
    return to_plain_text(strip_tags(article_markup))
Пример #2
0
def markdownify(url_list, **options):
    """Fetch every URL, extract its readable article and join them as Markdown.

    Required keyword options (a missing one raises ``KeyError``):

    * ``paragraph_links`` -- truthy sets html2text's ``links_each_paragraph``
      to 1, falsy to 0.
    * ``wrap_text`` -- truthy wraps the body at 78 columns, falsy disables
      wrapping.
    * ``preamble`` -- truthy prepends a "Title / Original URL" header.

    Every request is sent with the first URL as ``Referer``.  The articles are
    separated by a ``----`` rule.  Returns the combined Markdown encoded as
    UTF-8; an empty ``url_list`` yields an empty result.
    """
    paragraph_links = options['paragraph_links']
    wrap_text = options['wrap_text']
    preamble = options['preamble']

    if not url_list:
        # Nothing to fetch; previously this path hit an undefined title.
        return u"".encode("utf-8")

    first_url = url_list[0]
    title = None
    markdown_articles = []
    for position, url in enumerate(url_list):
        req = urllib2.Request(url, None, {'Referer': first_url})
        html = urllib2.urlopen(req).read()
        document = Document(html, url=url)
        if position == 0:
            # The preamble shows the first URL, so it gets the first title.
            title = document.short_title()

        h = html2text.HTML2Text(baseurl=url)
        h.inline_links = False
        h.links_each_paragraph = 1 if paragraph_links else 0
        h.body_width = 78 if wrap_text else 0
        # summary() is computed once per page and fed straight to html2text.
        markdown_articles.append(h.handle(document.summary()))

    combined_article = u"\n\n----\n\n".join(markdown_articles)
    if preamble:
        header = u"Title:        %s  \nOriginal URL: %s\n\n" % (title, first_url)
        combined_article = header + combined_article
    return combined_article.encode("utf-8")
Пример #3
0
def run(index):
    """Download the English article of data set ``index`` and save its text.

    Reads the article URL from ``data/<index>/url_en.txt``, downloads the
    page, extracts the main content with readability and writes two files
    into the same directory: ``article_en.txt`` (article text with the tags
    removed) and ``title.txt`` (the page title).  Both files are written as
    ASCII; characters that cannot be encoded are dropped.
    """
    print("Index %d" % index)
    dirname = "data/%04d" % index

    # url of english article
    with open(dirname + "/url_en.txt") as url_file:
        url = url_file.read()

    # download html
    response = urllib.urlopen(url)
    try:
        html = response.read().decode('latin-1')
    finally:
        response.close()

    # apply readability
    document = Document(html)
    article = nltk.clean_html(document.summary())

    # replace latin characters
    # NOTE(review): the first pattern was a string literal broken across two
    # lines (a syntax error).  U+0085 (NEL) is presumably the character that
    # was rendered as that line break -- confirm against the original script.
    for pattern, replacement in ((u'\x85', u'\n'),
                                 (u'\x92', u'`'),
                                 (u'\x96', u'-')):
        article = re.sub(pattern, replacement, article)

    # article_en.txt
    with codecs.open(dirname + "/article_en.txt", 'w',
                     encoding='ascii', errors='ignore') as output:
        output.write(article)

    # title.txt
    with codecs.open(dirname + "/title.txt", 'w',
                     encoding='ascii', errors='ignore') as output:
        output.write(document.title())
Пример #4
0
def extract_article(url):
    """Fetch ``url`` and return the readable article found on that page.

    Returns a dict with the keys ``title``, ``html``, ``content`` and ``url``
    when the server answers with HTTP 200, and an empty dict for any other
    status code.
    """
    response = requests.get(url)

    # anything but a 200 means there is no article to return
    if response.status_code != 200:
        return {}

    # parse the url reported by the response
    parsed_url = parse_url(response.url)

    # decode the body, dropping bytes that are not valid utf-8
    page_source = response.content.decode('utf-8', errors='ignore')

    # run readability
    extractor = Document(page_source)
    article_html = extractor.summary()

    # assemble the article data
    article = {}
    article['title'] = extractor.short_title()
    article['html'] = article_html
    article['content'] = strip_tags(article_html).encode('utf-8', errors='ignore')
    article['url'] = parsed_url
    return article
Пример #5
0
    def get_screen_play(self, url):
        """Download a web page and turn its readable article into a table.

        The extracted article HTML, its title and the resulting table are
        also stored on ``self`` as ``readable_article``, ``readable_title``
        and ``df_screenplay``.

        :param url: address of the page to download
        :return: DataFrame built from the asset list, with the columns
            ``type``, ``content``, ``local_src``, ``filename``, ``extname``,
            ``download_name`` and ``converted_name``
        """
        response = requests.get(url)
        page = response.content.decode('utf-8')

        # Extract the main article and the page title with readability
        article_html = Document(page).summary()
        self.readable_article = article_html
        self.readable_title = Document(page).title()

        base_url = path.dirname(response.request.url)
        assets = Extractor(base_url).html_to_asset_list(article_html)

        table = pd.DataFrame(assets, columns=['type', 'content'])
        table['local_src'] = table['content'].apply(self.string2hash)

        # File name and extension are only derived for image rows
        is_image = table['type'] == 'image'
        table.loc[is_image, 'filename'] = table.loc[is_image, 'content'].apply(
            path.basename)
        table.loc[is_image, 'extname'] = table.loc[is_image, 'filename'].apply(
            lambda name: path.splitext(name)[1])

        # Rows without a filename/extname get empty strings so the string
        # concatenation below works for every row
        table = table.fillna('')
        table['download_name'] = table['local_src'] + table['extname']
        table['converted_name'] = table['local_src'] + '.png'

        self.df_screenplay = table
        return table
Пример #6
0
 def _update(self, response):
     """Rebuild ``self.article`` from ``response`` and refresh ``self.last_updated``.

     The readable summary of ``response.text`` is parsed and every image in
     it is downloaded.  Images whose base64 form is shorter than 3000
     characters are inlined as ``data:`` URIs; larger ones are renamed to the
     SHA-1 of their base64 payload plus the original extension and appended
     after the markup as ``--data:<name>`` sections.
     """
     summary_html = Document(response.text).summary()
     tree = lxml.html.fromstring(summary_html)
     attachments = []
     for img in tree.xpath("//img"):
         img_url = urlparse.urljoin(response.url, img.get("src"))
         img_response = requests.get(img_url)
         payload = base64.b64encode(img_response.content)
         if len(payload) >= 3000:
             # too large to inline: reference it by content hash instead
             hasher = hashlib.sha1()
             hasher.update(payload)
             new_src = hasher.hexdigest() + "." + img_url.rpartition(".")[2]
             attachments.append((new_src, payload))
         else:
             content_type = img_response.headers["content-type"]
             new_src = "data:" + content_type + ";base64," + payload
         img.set("src", new_src)

     out = StringIO()
     out.write(lxml.etree.tostring(tree, pretty_print=True))
     for name, image_data in attachments:
         out.write("\n--data:" + name + "\n" + image_data)
     self.article = out.getvalue()
     self.last_updated = datetime.now()
Пример #7
0
class Article:
    """Readable version of a web page with its images saved locally.

    On construction the page at ``url`` is downloaded and reduced to its main
    content with readability, the title is inserted as a heading at the top
    of the body, and every image that can be fetched is written to a fresh
    temporary directory with its ``src`` rewritten to the local path.
    """

    def __init__(self, url):
        print('Saving page: {}'.format(url))
        res = requests.get(url)
        self.url = url
        self.article = Document(res.content)
        self._add_title()
        self._save_images()

    def _add_title(self):
        """Parse the summary into ``self.root`` and insert the title first in its body."""
        self.root = etree.fromstring(self.article.summary())
        body = self.root.find('body')

        title = self.article.title()
        # unicode titles are transliterated, presumably so that the
        # str.format() call below cannot fail on non-ASCII characters
        ascii_title = unidecode(title) if isinstance(title, unicode) else title

        title_header = etree.HTML('<h2>{}</h2>'.format(ascii_title))
        body.insert(0, title_header)

    def _save_images(self):
        """Download every image of the article and point its ``src`` at the local copy.

        Images that cannot be fetched (request error or HTTP 404) keep their
        original ``src``; the remaining images are still processed.
        """
        tmppath = tempfile.mkdtemp()
        for img in self.root.xpath('//img'):
            original_src = img.get('src')
            if not original_src:
                # an <img> without a source has nothing to download
                continue
            imgsrc = original_src

            # handle scheme-agnostic URLs
            if 'http' not in imgsrc and '//' in imgsrc:
                imgsrc = 'http:{}'.format(imgsrc)

            # handle relative file paths
            elif 'http' not in imgsrc:
                parsed = urlparse(self.url)
                imgsrc = '{}://{}{}'.format(parsed.scheme, parsed.netloc, imgsrc)

            filename = os.path.basename(imgsrc)
            dest = os.path.join(tmppath, filename)

            try:
                res = requests.get(imgsrc)
            except Exception as e:
                print('Could not fetch image ({}) from "{}"'.format(str(e), imgsrc))
                # skip only this image, exactly like the 404 case below
                continue

            if res.status_code == 404:
                print('Could not fetch image (HTTP 404), attempted fetch: "{}", source URL: {}'.format(imgsrc, original_src))
                continue

            with open(dest, 'wb') as f:
                f.write(res.content)

            img.set('src', dest)

    @property
    def title(self):
        """Page title as reported by readability."""
        return self.article.title()

    @property
    def html(self):
        """Serialized article markup, including the title and local image paths."""
        return etree.tostring(self.root)
Пример #8
0
def extract_article(url):
    """Download ``url`` and extract its readable article.

    For an HTTP 200 answer the result is a dict with ``title``, ``html``,
    ``content`` and ``url``; every other status code gives an empty dict.
    """
    reply = requests.get(url)
    if reply.status_code != 200:
        # the page could not be retrieved
        return {}

    # parse the url reported by the response
    final_url = parse_url(reply.url)

    # body as text; undecodable bytes are ignored
    markup = reply.content.decode('utf-8', errors='ignore')

    # run readability
    readability_doc = Document(markup)
    summary_html = readability_doc.summary()

    title = readability_doc.short_title()
    plain_content = strip_tags(summary_html).encode('utf-8', errors='ignore')
    return {
        'title': title,
        'html': summary_html,
        'content': plain_content,
        'url': final_url,
    }
Пример #9
0
def extract(text):
    """Pull the post body and the tag/category names out of a blog page.

    The body is the ``div#cnblogs_post_body`` element when present, otherwise
    the readability summary of ``text``.  Tag names are the link texts found
    inside ``li#EntryTag`` and ``div#BlogPostCategory``.

    NOTE(review): the visible code computes ``content`` and ``tag_str`` but
    returns neither -- confirm whether the function was truncated.
    """
    soup = BeautifulSoup(text, 'html.parser')  # , from_encoding="utf8")
    entry_tags = soup.find('li', {'id': 'EntryTag'})
    print(entry_tags)
    categories = soup.find('div', {'id': 'BlogPostCategory'})
    tag_str = ''
    print(categories)
    post_body = soup.find('div', {'id': 'cnblogs_post_body'})
    if post_body:
        try:
            content = str(post_body)
            logging.info('find content in html tag')
        except Exception:
            content = Document(text).summary()
            logging.info('conver soup to string error so via readability',
                         exc_info=True)
    else:
        content = Document(text).summary()
        logging.info('find content via readability')
    try:
        # get_text must be *called*; joining the bound methods themselves
        # always raised TypeError, so no tag was ever collected.
        tag_list = [link.get_text() for link in entry_tags.find_all('a')]
        tag_str = ','.join(tag_list)
        tag_list2 = [link.get_text() for link in categories.find_all('a')]
        tag_str += ','.join(tag_list2)
    except Exception:
        # either container may be missing (find() returned None)
        logging.error('cant find keyword in html', exc_info=True)
Пример #10
0
def extract_article(url):
    """Download ``url`` and return its extracted title and text.

    The title comes from readability, the text from boilerpipe.  Returns a
    dict with ``extracted_title`` and ``extracted_content`` for an HTTP 200
    answer and an empty dict for every other status code.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return {}

    # the response url is parsed, but is not part of the returned data
    parse_url(response.url)

    # body as text; undecodable bytes are ignored
    page_source = response.content.decode('utf-8', errors='ignore')

    # run boilerpipe
    boilerpipe = Extractor(html=page_source)

    # run readability; the summary is computed although only the title is used
    readability = Document(page_source)
    readability.summary()

    title = readability.short_title().strip()
    body_text = strip_tags(boilerpipe.getText())
    return {
        'extracted_title': title,
        'extracted_content': body_text,
    }
Пример #11
0
def getText():
    """Collect the text of every supported file in the documents folder.

    Word documents contribute one entry per paragraph.  HTML files contribute
    a single entry made of the page title, a dot and the readable article
    text, with line breaks flattened and passed through ``replaceTwoOrMore``.
    PDF files are recognised but not processed yet.  An empty line is printed
    for every file (two for unrecognised types).
    """
    collected = []
    for entry in os.listdir('unsupervised\\documents'):
        filePath = 'unsupervised\\documents\\' + entry
        extension = os.path.splitext(filePath)[1].lower()

        if extension == '.docx':
            print('')
            collected.extend(p.text for p in docxDocument(filePath).paragraphs)
        elif extension == '.pdf':
            print('')
            # TODO
        elif extension in ('.html', '.htm'):
            print('')
            with codecs.open(filePath, errors='ignore') as page:
                source = page.read()
                article = Document(source).summary()
                title = Document(source).title()
                soup = BeautifulSoup(article, 'lxml')
                flat_title = title.replace('\n', ' ').replace('\r', '')
                flat_body = soup.text.replace('\n', ' ').replace('\r', '')
                collected.append(replaceTwoOrMore(flat_title + '.' + flat_body))
        else:
            # unrecognised document type
            print('')
            print('')
    return collected
Пример #12
0
def contents_scraping(link, remove_space=True, remove_lb=True):
    """Scrape the title and the main text of a web page.

    Parameters
    ----------
    link : str
        Scraping target url.
    remove_space : bool
        When true, every space character is removed from the text.
    remove_lb : bool
        When true, carriage returns and line feeds are removed from the text.

    Returns
    -------
    tuple
        ``(title, contents)`` on success; ``(False, "")`` when the page
        could not be downloaded.
    """
    try:
        # the context manager closes the connection once the body is read
        with urllib.request.urlopen(link) as response:
            html = response.read()
    except Exception:
        print("ERROR : failed to get contents. -> " + link)
        return (False, "")

    # one Document serves both the title and the summary
    document = Document(html)
    title = document.short_title()
    contents = html2text.html2text(document.summary())

    # drop any markup that survived the html2text conversion
    c = re.sub(r"<[^>]*?>", "", contents)

    if remove_space:
        c = c.replace(" ", "")

    if remove_lb:
        c = c.replace("\r", "").replace("\n", "")

    return title, c
Пример #13
0
def recommend_by_url(url):
    """Return up to 15 stored articles most similar to the page at ``url``.

    The page is downloaded, its text is turned into bigrams and projected
    through ``dictionary`` and ``lsi``; ``index`` yields one similarity score
    per entry of ``ARTICLES``.  Entries are visited from the highest score
    down.  Entries without a url, with an url already returned, or whose host
    ends with the host of ``url`` are skipped.

    Each result is a copy of the stored article without ``content`` and
    ``html``, with a float ``score`` added and ``summary`` stripped.  The
    entries of ``ARTICLES`` themselves are left untouched.
    """
    source_host = urlparse(url).hostname
    page = Document(requests.get(url).content)
    content = html.fromstring(page.content()).xpath('string()')
    bigrams = make_bigrams(content)
    vec_bow = dictionary.doc2bow(bigrams)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]

    ranked = sorted(enumerate(sims), key=lambda item: -item[1])
    results = []
    seen = set()
    for article_id, score in ranked:
        article = ARTICLES[article_id]
        if 'url' not in article or article['url'] in seen:
            continue
        seen.add(article['url'])

        # urls without a host cannot be compared; they are not filtered out
        host = urlparse(article['url']).hostname
        if host and source_host and host.endswith(source_host):
            continue

        # work on a copy so the shared ARTICLES entry keeps its content/html
        res = dict(article)
        res.pop('content', None)
        res.pop('html', None)
        res['score'] = float(score)
        if res.get('summary'):
            res['summary'] = res['summary'].strip()
        results.append(res)
        if len(results) > 14:
            break
    return results
Пример #14
0
class Gist:
    """Title, plain text and keyword suggestions extracted from an HTML page."""

    # a keyword must not contain any digit
    keyword_pattern = re.compile(r'^[^\d]+$')
    stop_words = set(get_stop_words('en'))

    def __init__(self, html):
        self.html = html
        self.document = Document(html)

    @property
    def title(self):
        """Shortened page title as reported by readability."""
        return self.document.short_title()

    @cached_property
    def text(self):
        """Readable article as plain text with paragraph breaks preserved.

        ``<br>`` becomes a newline, ``<p>``/``</p>`` a blank line, every other
        tag is dropped, whitespace-only lines are emptied and runs of three
        or more newlines collapse to one blank line.
        """
        text = self.document.summary()
        # ``[^>]*`` (not ``+``) so attribute-less tags such as <p>, </p> and
        # <br> match too; ``\b`` keeps <pre>, <param>, ... out of the <p> rule.
        text = re.sub(r'<br\b[^>]*>', '\n', text)
        text = re.sub(r'</?p\b[^>]*>', '\n\n', text)
        text = re.sub(r'<[^>]+>', '', text)
        # MULTILINE is needed here so ^ and $ apply to every line
        text = re.sub(r'^[ \t]+$', '', text, flags=re.MULTILINE)
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text

    @staticmethod
    def _common_prefix(one, two):
        """Return the length of the longest common prefix of ``one`` and ``two``."""
        parallelity = [x == y for x, y in zip(one, two)] + [False]
        return parallelity.index(False)

    @classmethod
    def _find_representative(cls, stem, text):
        """Return the token of ``text`` sharing the longest prefix with ``stem``.

        Among tokens with an equally long shared prefix the shortest wins.
        """
        tokens = text.split()
        prefixes = {token: cls._common_prefix(token, stem) for token in tokens}

        def rank(item):
            token, shared = item
            return (-shared, len(token))

        return sorted(prefixes.items(), key=rank)[0][0]

    @classmethod
    def _is_good_keyword(cls, word):
        """Truthy when ``word`` is not a stop word and contains no digit."""
        return (word not in cls.stop_words) and \
                cls.keyword_pattern.match(word)

    @classmethod
    def find_keywords(cls, text):
        """Suggest keywords for ``text`` using the key terms of the whoosh index."""
        whoosh_backend = SearchForm().searchqueryset.query.backend
        if not whoosh_backend.setup_complete:
            whoosh_backend.setup()
        with whoosh_backend.index.searcher() as searcher:
            keywords = searcher.key_terms_from_text(
                'text', text, numterms=10, normalize=False)
        # key_terms_from_text yields tuples; only the first items are kept
        keywords = list(zip(*keywords))[0] if keywords else []
        keywords = [cls._find_representative(keyword, text) for keyword in keywords]
        keywords = [keyword for keyword in keywords if cls._is_good_keyword(keyword)]
        #no double keywords in list
        keywords = list(set(keywords))
        #no punctuation in suggested keywords
        keywords = [''.join(c for c in s if c not in string.punctuation) for s in keywords]
        return keywords

    @property
    def keywords(self):
        """Keyword suggestions for the readable text of this page."""
        return self.find_keywords(self.text)
Пример #15
0
def get_announcement_body(url):
    """Open ``url`` and return ``[text, title, image_urls]`` of its readable part.

    ``text`` is the text of the readability summary, ``title`` the page title
    and ``image_urls`` the ``src`` of every image in the summary, resolved
    against ``url``.
    """
    now = datetime.datetime.now()
    html = br.open(url).read()

    readable_announcement = Document(html).summary()
    readable_title = Document(html).title()
    soup = BeautifulSoup(readable_announcement, "lxml")
    final_announcement = soup.text
    images = [urlparse.urljoin(url, tag['src'])
              for tag in soup.findAll('img', src=True)]

    # Record prepared for insertDB(); the insert itself is disabled, so the
    # record is built but not used.
    timestamp = "%s %s %s-%s:%s:%s" % (
        now.month, now.day, now.year, now.hour, now.minute, now.second)
    resp = [
        str(final_announcement.encode("ascii", "ignore")),
        str(readable_title.encode("ascii", "ignore")),
        timestamp,
        url,
        url,
        "",
    ]
    #insertDB(resp)

    return [final_announcement, readable_title, images]
Пример #16
0
def process(doc, params):
    """Convert an HTML document to text and collect its images.

    :param doc: HTML source handed to readability's ``Document``.
    :param params: mapping; ``params['url']`` is the page address used to
        resolve relative image sources.
    :return: ``(text, images, 1, None)``.  ``text`` is the html2text
        rendering of the title followed by the summary; ``images`` is a list
        of ``(name, temp_file)`` pairs, one per image that could be collected.
    """
    url = params['url']
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        src = img.get('src')
        if not src:
            # urljoin() returns the page url itself for an empty source, so
            # the page would be downloaded and stored as an "image".
            continue
        fp = None
        try:
            fp = tempfile.NamedTemporaryFile(dir='/tmp/')
            img_src = urljoin(url, src)
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                img_name = get_filename_from_url(img_src)
                write_file(r, fp)
            else:
                # anything else is treated as "<meta>,<base64 payload>"
                img_meta, content = img_src.split(',')
                image = base64.b64decode(content)
                img_name = get_filename_from_base64(img_meta)
                fp.write(image)
            images.append((img_name, fp))
        except Exception:
            if fp is not None:
                # closing a NamedTemporaryFile also deletes it
                fp.close()
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )

    html = '<h1>' + title + '</h1>' + summary
    html = '<p>{}</p>'.format(html)

    text = html2text.html2text(html)
    return text, images, 1, None
 def get_data(url):
     """Download ``url`` and return ``(short_title, content_text)``.

     ``(0, 0)`` is returned when the status code is neither 200 nor 503,
     after ten connection errors, or when the extraction raises.  A 503
     answer pauses for an hour and then retries; a connection error pauses
     five seconds and retries.
     """
     failures = 0
     while True:
         if failures >= 10:
             cprint("Finished Because error_num reached 10 times", "red")
             return 0, 0
         try:
             req = requests.get(url)
             status = int(req.status_code)
             if status == 200:
                 html = req.text
                 break
             if status != 503:
                 cprint("Now Get StatusCode{}: Error_num{}".format(req.status_code, failures), "red")
                 return 0, 0
             # 503: wait an hour before asking again
             cprint("Google detected the abnormal network traffic", "red")
             time.sleep(60 * 60)
         except ConnectionError:
             cprint("Now Get ConnectionError: Error_num{}".format(failures), "red")
             failures += 1
             time.sleep(5)

     try:
         document = Document(html)
         content_html = document.summary()
         content_text = lxml.html.fromstring(content_html).text_content().strip()
         short_title = document.short_title()
     except:
         return 0, 0
     return short_title, content_text
Пример #18
0
def get_webpage_by_html(url, html=None):
    """Build a ``Webpage`` for ``url`` and return its attribute dictionary.

    The summary comes from ``predefined_site`` when that returns one and from
    readability otherwise.  The content is the result of ``video_site`` when
    that returns one and the summary's html otherwise.
    """
    from urllib.parse import urlparse

    html = get_html_str(url, html)
    summary_obj = predefined_site(url, html)
    article = video_site(url)
    if summary_obj is None:
        # no predefined handling for this site: fall back to readability
        summary_obj = Document(
            html, url=url, debug=True, multipage=False
        ).summary_with_metadata(enclose_with_html_tag=False)
    title = summary_obj.short_title
    if article is None:
        article = summary_obj.html

    page = Webpage()
    page.url = url
    page.domain = urlparse(url).hostname
    page.title = title
    page.favicon = ""
    page.top_image = None
    page.excerpt = summary_obj.description
    page.author = None
    page.content = article
    page.tags = get_suggest_tags(title, article, summary_obj.keywords)
    page.movies = []
    page.raw_html = html
    page.publish_date = None
    page.segmentation = get_segmentation(title, article)
    return page.__dict__
def crawl(site, depth, linksfile):
    pattern = re.compile(r'href="(http://.*?)"')
    f = open(linksfile, 'a+')
    try:
        if depth < MAX_DEPTH:
            print 'crawling [%s]...' % site,
            print >> f, '[%s]' % site

            br = mechanize.Browser()
            br.set_handle_robots(False)
            br.addheaders = [('User-agent', 'Firefox')]
            url = br.open(site)
            content = url.read()

            hits = pattern.findall(content)
            for hit in hits:
                print >> f, hit
                url2 = br.open(hit)
                content2 = url.read()
                readable_article = Document(content2).summary()
                readable_title = Document(content).short_title()
                soup = BeautifulSoup(readable_article)
                final_article = soup.text
                links = soup.findAll('img', src=True)
                print final_article

            print 'done.'
            print >> f, ''

            for hit in hits:

                crawl(hit, depth + 1, linksfile)
    except:
        pass
    f.close()
Пример #20
0
 def loadFromWeb(cls,url):
     """Download ``url`` and return the text of its readable article.

     The readability summary is reduced to its text with BeautifulSoup and
     HTML entities are unescaped.
     """
     html = requests.get(url).content
     # only the summary is needed; the title was parsed but never used
     readable_article = Document(html).summary()
     cleantext = BeautifulSoup(readable_article).text
     return HTMLParser.HTMLParser().unescape(cleantext)
Пример #21
0
    async def enrich(self, result):
        """Add the page title and, when missing, the readable content to ``result``.

        Nothing is changed when there is no soup.  The content is extracted
        from the joined ``<noscript>`` strings, or from the whole text of the
        soup when those are empty.  Errors during the extraction are ignored.
        """
        if not self.soup:
            return result

        result.set('title', self.soup.title.string, 0, 'textlength')

        if result.has('content'):
            return result

        noscript_texts = [
            tag.string
            for tag in self.soup.find_all("noscript")
            if tag.string is not None
        ]
        html = " ".join(noscript_texts).strip() or self.soup.all_text()

        try:
            summary = Document(html, url=self.url).summary(html_partial=True)
            result.set('content', sanitize_html(summary))
        # pylint: disable=bare-except
        except:
            # content extraction is best effort
            pass

        return result
Пример #22
0
def process(doc, url):
    """Convert an HTML document to plain text and collect its images.

    :param doc: HTML source handed to readability's ``Document``.
    :param url: page address used to resolve relative image sources.
    :return: ``(text, images, 1)``.  ``text`` is the plain text of the title
        followed by the summary; ``images`` is a list of temporary files, one
        per image that could be collected.
    """
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        src = img.get('src')
        if not src:
            # urljoin() returns the page url itself for an empty source, so
            # the page would be downloaded and stored as an "image".
            continue
        fp = None
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR)
            img_src = urljoin(url, src)
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                write_file(r, fp)
            else:
                # anything else is treated as "<meta>,<base64 payload>"
                image = base64.b64decode(img_src.split(',')[1])
                fp.write(image)
            images.append(fp)
        except Exception:
            if fp is not None:
                # closing a NamedTemporaryFile also deletes it
                fp.close()
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {
                    'url': url
                }},
            )

    html = '<h1>' + title + '</h1>' + summary

    # drop every newline before the markup is parsed
    html = '<p>{}</p>'.format(html.replace('\n', ''))

    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images, 1
Пример #23
0
def main():
    """Download a range of novel chapters and save them in one HTML file.

    Command line: ``<novel> <first-chapter> [<last-chapter>]`` where
    ``<novel>`` is one of the keys of ``novels``.  Without a last chapter
    only the first one is downloaded.  The output file is named after the
    title of the last downloaded chapter plus the last chapter number.
    """
    novels = {
        'cbi': 'https://boxnovel.com/novel/castle-of-black-iron/chapter-',
        'sgg': 'https://boxnovel.com/novel/super-gene/chapter-',
        'sas': 'https://boxnovel.com/novel/strongest-abandoned-son/chapter-',
        'atg': 'https://www.wuxiaworld.com/novel/against-the-gods/atg-chapter-'
    }
    if len(sys.argv) < 3 or sys.argv[1] not in novels:
        sys.exit('usage: {} <{}> <first-chapter> [<last-chapter>]'.format(
            sys.argv[0], '|'.join(sorted(novels))))

    inicio = int(sys.argv[2])
    ultimo = int(sys.argv[3]) if len(sys.argv) >= 4 else inicio
    fim = ultimo + 1

    url = novels[sys.argv[1]]
    total = []
    fileName = None
    for i in range(inicio, fim):
        response = getPage(url + str(i))
        doc = Document(response.text)
        fileName = re.sub(r'[^a-zA-Z0-9]+', ' ', doc.title())
        total.append(doc.summary())
        print(i)

    if fileName is None:
        # empty chapter range: nothing was downloaded, nothing to write
        return

    with open(fileName + str(fim - 1) + '.html', 'w') as f:
        f.writelines(total)
Пример #24
0
    def _parse_article(self, response):
        """Build the feed item for one article response.

        Title and content come from readability when the response is text.
        The feed entry's own title and summary are used when the response is
        not text; the summary also replaces the content when the page is
        unparseable or the extracted content is shorter than the summary.
        """
        feed_entry = response.meta["feed_entry"]
        loader = FeedEntryItemLoader(parent=response.meta["il"])

        try:
            page_text = response.text
        except AttributeError:
            # Response is not text (e.g. PDF, ...).
            loader.add_value("title", feed_entry.get("title"))
            loader.add_value("content_html", feed_entry.get("summary"))
            return loader.load_item()

        doc = Document(page_text, url=response.url)
        loader.add_value("title", doc.short_title() or feed_entry.get("title"))

        fallback = feed_entry.get("summary")
        try:
            content = doc.summary(html_partial=True)
        except Unparseable:
            content = fallback
        else:
            # Something probably went wrong if the extracted content is
            # shorter than the summary.
            if fallback and len(fallback) > len(content):
                content = fallback
        loader.add_value("content_html", content)

        return loader.load_item()
Пример #25
0
    def getTextFromHTML(self, url_id):
        """Return the readable text of the stored HTML for ``url_id``.

        An already stored readable text is returned as is.  Otherwise the
        HTML is run through Readability (Document), the resulting text (or an
        empty string on failure) is saved, and the text or ``False`` is
        returned.  ``False`` is returned without saving when there is no HTML
        row or the summary looks like an error page.
        """
        html_row = get_html(self.pg_conn, url_id)

        if not html_row or 'html' not in html_row:
            return False

        stored_text = html_row['readabletext']
        if stored_text:
            return stored_text

        html = html_row['html']
        error_markers = (
            "Something's wrong here...",
            "<h1>Not Found</h1><p>The requested URL",
            "403 Forbidden",
        )

        try:
            html_summary = Document(html).summary(html_partial=True)
            html_summary = html_summary.replace('\n', '').replace('\t', '')

            looks_broken = (
                len(html_summary) < 150
                or html_summary == "<html><head/></html>"
                or any(marker in html_summary for marker in error_markers)
            )
            if looks_broken:
                return False

            raw_text = lxml.html.document_fromstring(html_summary).text_content()
        except:
            raw_text = False

        # an empty string is stored when no text could be extracted
        save_readabletext(self.pg_conn, url_id, raw_text if raw_text else '', 'meta')

        return raw_text
Пример #26
0
def get_webpage_by_html(url, html=None):
    """Assemble a ``Webpage`` for ``url`` and return its ``__dict__``.

    ``predefined_site`` supplies the summary when it returns one; otherwise
    readability does.  ``video_site`` supplies the content when it returns
    one; otherwise the summary's html is used.
    """
    from urllib.parse import urlparse

    html = get_html_str(url, html)
    summary_obj = predefined_site(url, html)
    article = video_site(url)
    if summary_obj is None:
        # no predefined handling for this site: fall back to readability
        readability_doc = Document(html, url=url, debug=True, multipage=False)
        summary_obj = readability_doc.summary_with_metadata(
            enclose_with_html_tag=False)
    title = summary_obj.short_title
    if article is None:
        article = summary_obj.html

    result = Webpage()
    result.url = url
    result.domain = urlparse(url).hostname
    result.title = title
    result.favicon = ""
    result.top_image = None
    result.excerpt = summary_obj.description
    result.author = None
    result.content = article
    result.tags = get_suggest_tags(title, article, summary_obj.keywords)
    result.movies = []
    result.raw_html = html
    result.publish_date = None
    result.segmentation = get_segmentation(title, article)
    return result.__dict__
Пример #27
0
    def getTextFromHTML(self, url_id):
        """Run Readability (Document) on the stored HTML of ``url_id``.

        Returns the previously stored readable text when there is one.
        Otherwise the text is extracted, saved (an empty string on failure)
        and returned; ``False`` is returned when extraction fails.  When the
        HTML row is missing or the summary looks like an error page,
        ``False`` is returned and nothing is saved.
        """
        html_row = get_html(self.pg_conn, url_id)

        if not html_row or 'html' not in html_row:
            return False

        previous = html_row['readabletext']
        if previous:
            return previous

        html = html_row['html']

        def is_error_page(summary):
            # too short, an empty document, or a known error message
            if len(summary) < 150 or summary == "<html><head/></html>":
                return True
            for marker in ("Something's wrong here...",
                           "<h1>Not Found</h1><p>The requested URL",
                           "403 Forbidden"):
                if marker in summary:
                    return True
            return False

        try:
            html_summary = Document(html).summary(html_partial=True)
            html_summary = html_summary.replace('\n', '').replace('\t', '')

            if is_error_page(html_summary):
                return False

            parsed = lxml.html.document_fromstring(html_summary)
            raw_text = parsed.text_content()
        except:
            raw_text = False

        text_to_store = raw_text if raw_text else ''
        save_readabletext(self.pg_conn, url_id, text_to_store, 'meta')

        return raw_text
Пример #28
0
def reada(url, cache=True):
    """Return the readable version of ``url`` as Markdown.

    The result is a dict with ``url``, ``title`` and ``content``.  With
    ``cache`` enabled a memcached result is returned when present, and a
    fresh result is stored for 600 seconds.
    """
    if cache:
        cached = memcache.get(key=url)
        if cached is not None:
            return cached

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    page = opener.open(url)

    enc = 'utf-8'
    try:
        # 1, web html 2 readability
        raw = Document(page.read(), url=url)
        html = raw.summary().encode(enc, 'replace')
        title = raw.short_title()

        # 2, readability 2 markdown, copy from main
        data = html.decode(enc)
        h = html2text.HTML2Text(baseurl=url)
        h.ignore_images = False
        h.body_width = 100000
        text = h.handle(data)
    finally:
        page.close()

    d = {'url': url, 'title': title, 'content': text}
    if cache:
        memcache.add(key=url, value=d, time=600)
    return d
Пример #29
0
    def getTitleAndContent(self, contentUrl):
        """Fetch ``contentUrl`` and extract its readable title and text.

        Returns ``self.return_data(0, "success", data)`` where ``data`` holds
        ``title`` and ``content`` (the readability summary converted by
        ``self.ht``), or ``self.return_data(1, error)`` when anything raises.
        """
        myHeader = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:55.0) Gecko/20100101 Firefox/55.0',
        }
        try:
            r = self.http.request('GET', contentUrl, headers=myHeader)
            # one Document serves both the title and the summary
            document = Document(r.data)
            readable_title = document.short_title()
            readable_article = document.summary()
            content = self.ht.handle(readable_article)

            data = {"title": readable_title, "content": content}
            return self.return_data(0, "success", data)
        except Exception as e:
            return self.return_data(1, e)
Пример #30
0
def getContent(url):
    """Crawl ``url`` and save its cleaned article under ``news/<title>.txt``.

    The readable article is reduced to ``<p>``, ``<img>``, ``<br>`` and
    ``<iframe>`` markup.  Only articles for which ``number`` reports more
    than 100 are written; in that case the success marker string is returned
    whether or not the write succeeded.  Otherwise ``None`` is returned.
    """
    print('@@ start crawl %s @@@' % url)

    html = getHTml(url)
    # run the readability analysis
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()

    without_scripts = re.sub(r'<script[\s\S]*?</script>|&#13;', '', readable_article).strip()
    kept_tags = re.sub(r'<(?!p|img|/p|br|iframe)[^<>]*?>', '', without_scripts).strip()
    plain_paragraphs = re.sub(r'<p[^>]*?>', '<p>', kept_tags).strip().replace('\n', '')
    body = re.sub(r'<p>\s+<p>', '', plain_paragraphs)

    # count the Chinese characters
    num = number(kept_tags)

    if not num > 100:
        return None

    record = url + '\n' + readable_title + '\n' + body + '\n' + current_date + '\n'

    try:
        with open('news/' + readable_title + '.txt', 'w') as out:
            out.write(record)
        print('执行成功')
    except Exception as e:
        print('执行失败,%s' % e)

    return '成功'
Пример #31
0
def main():
    """Print TF-IDF scores for the words of all files in ``documents``.

    Every entry of the ``documents`` directory is dispatched on its file
    extension:

    * ``.docx`` -- the text of each paragraph is collected;
    * ``.pdf``  -- not handled yet (only a blank line is printed);
    * ``.html`` / ``.htm`` -- readability extracts title and article, which
      are flattened to one line and collected;
    * anything else -- two blank lines are printed.

    All collected text is joined into a single blob, and each non-stop-word
    is printed with its ``tfidf`` score, highest score first.
    """
    dataList = []

    for f in os.listdir('documents'):
        # os.path.join instead of a hard-coded backslash keeps this portable.
        filePath = os.path.join('documents', f)
        fileExtension = os.path.splitext(filePath)[1].lower()

        if fileExtension == '.docx':
            print('')
            doc = docxDocument(filePath)
            dataList.extend(p.text for p in doc.paragraphs)
        elif fileExtension == '.pdf':
            # TODO: PDF extraction is not implemented.
            print('')
        elif fileExtension in ('.html', '.htm'):
            print('')
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            # One Document serves both the summary and the title.
            document = Document(source)
            article = document.summary()
            title = document.title()
            soup = BeautifulSoup(article, 'lxml')
            flat_title = title.replace('\n', ' ').replace('\r', '')
            flat_body = soup.text.replace('\n', ' ').replace('\r', '')
            dataList.append(replaceTwoOrMore(flat_title + '.' + flat_body))
        else:
            print('')
            print('')

    # Load the stop words once; a set gives O(1) membership tests instead of
    # re-reading the corpus list for every word.
    cachedStopWords = set(stopwords.words("english"))
    combined = ' '.join(dataList)

    bloblist = [tb(combined)]

    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, blob, bloblist)
                  for word in blob.words if word not in cachedStopWords}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Пример #32
0
def download_via_url(url):
    """Download *url* and return ``(title, text)`` of its main article.

    The response text is handed to readability's ``Document``.  The title
    comes from ``Document.title()``; the text is the summary HTML with its
    markup removed by BeautifulSoup's ``html.parser``.
    """
    page = Document(requests.get(url).text)
    heading = page.title()
    plain_text = BeautifulSoup(page.summary(), "html.parser").text
    return heading, plain_text
 def extract(self, html):
     """Run readability over *html* and cache the results on the instance.

     Stores the title, the summary HTML, its html2text (Markdown) rendering
     and the output of ``__format_to_text`` in private attributes, then
     returns the latter.
     """
     # Reference implementation:
     # https://github.com/buriy/python-readability/blob/master/readability/readability.py
     readable = Document(html)
     self.__title = readable.title()
     self.__html = readable.summary()
     # Both derived representations are computed from the stored summary.
     self.__md = html2text.html2text(self.__html)
     self.__text = self.__format_to_text(self.__html)
     return self.__text
Пример #34
0
def process(doc):
    """Return ``(title, text)`` for the HTML given in *doc*.

    readability produces the summary HTML and the short title; the summary
    is converted to text by the module-level ``text_maker``.
    """
    readable = Document(doc)
    article_html = readable.summary()
    return readable.short_title(), text_maker.handle(article_html)
Пример #35
0
def main():
    """Download a fixed article and save its readable version as HTML.

    The summary produced by readability is written, UTF-8 encoded, to a file
    named after the article's short title.
    """
    html = urllib.urlopen("http://habrahabr.ru/post/150756/").read()
    doc = Document(html)
    short_title = doc.short_title()
    readable_article = doc.summary()
    # ``with`` closes the file even when encoding or writing fails.
    with open("C:\\users\\mykola\\documents\\%s.html" % short_title, "wb") as f:
        f.write(readable_article.encode("utf-8"))
Пример #36
0
def main():
    """Fetch one hard-coded article and store its readable HTML on disk.

    readability extracts the short title and the summary; the summary is
    encoded as UTF-8 and written to ``<short title>.html``.
    """
    html = urllib.urlopen("http://habrahabr.ru/post/150756/").read()
    doc = Document(html)
    short_title = doc.short_title()
    readable_article = doc.summary()
    target = "C:\\users\\mykola\\documents\\%s.html" % short_title
    # The context manager guarantees the handle is released on any error.
    with open(target, "wb") as f:
        f.write(readable_article.encode("utf-8"))
Пример #37
0
 def parse(self, response):
     """Yield a single item holding the page title and a fixed date.

     The title is taken from readability's ``Document.title()``.
     """
     item = {
         'full_title': Document(response.text).title(),
         # Date extraction via XPath (//time/@datetime or
         # //span[@class="post-date"]) is disabled; the value is hard-coded.
         'date': '2009',
     }
     yield item
Пример #38
0
 def _getResponseText(self, response):
     '''
     (response) -> Text
     Run readability over ``response.body`` and return the document title
     immediately followed by the summary HTML.
     '''
     document = Document(response.body)
     title = document.title()
     summary = document.summary()
     return title + summary
Пример #39
0
def get_main_content(html):
    """Return ``(short_title, cleaned_html)`` for the page in *html*.

    The readability summary is post-processed with three substitutions:
    ``<div>`` tags are removed, anchor tags are removed (together with a
    ``</p>`` directly before an opening anchor or a ``<p>`` directly after a
    closing one), and attribute-less ``<select>...</select>`` elements that
    fit on one line are dropped.
    """
    # A single Document provides both the title and the summary.
    document = Document(html)
    readable_title = document.short_title()
    readable_article = document.summary()

    text_p = re.sub(r'</?div.*?>', '', readable_article)
    text_p = re.sub(r'((</p>)?<a href=.*?>|</a>(<p>)?)', '', text_p)
    text_p = re.sub(r'<select>.*?</select>', '', text_p)

    return readable_title, text_p
Пример #40
0
def checkerFunction(myInput):
    """Fetch the privacy policy found for *myInput*, store it, return its MD5.

    A Google "I'm feeling lucky" style query (``btnI=1``) for
    ``<myInput> privacy policy`` is issued without following redirects; the
    ``location`` header of the response is used as the policy URL.  The
    readability summary of that page is ASCII-encoded (dropping other
    characters) and written to ``./sandbox/db/<myInput>/<myInput>.<date>``.
    If that file already exists the text goes to the same name with a
    ``.tmp.`` suffix instead.

    Returns:
        The hex MD5 digest of the ASCII-encoded summary, or ``None`` when
        the lookup request fails or yields no redirect location.
    """
    today = datetime.date.today()
    keyword = myInput
    try:
        google1 = 'http://www.google.com/search?hl=en&q='
        google2 = '%20privacy%20policy&btnI=1'
        r = requests.get(google1 + keyword + google2, allow_redirects=False)
        url = r.headers['location']
    except Exception:
        # Best effort: any failure of the lookup means "nothing to check".
        return

    myFullPath = "./sandbox/db/" + keyword
    # makedirs creates the intermediate ./sandbox and ./sandbox/db as well.
    if not os.path.exists(myFullPath):
        os.makedirs(myFullPath)

    filetowrite = myFullPath + "/" + keyword + "." + str(today)
    fileExist = os.path.isfile(filetowrite)

    if url is None:
        return
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary()
    payload = readable_article.encode('ascii', 'ignore')

    if fileExist:
        # Keep the first snapshot of the day; later runs go to a temp name.
        filetowrite = filetowrite + ".tmp."
    # The original referenced ``f.close`` without calling it, so the file
    # was never closed explicitly; ``with`` closes it reliably.
    with open(filetowrite, 'w') as f:
        f.write(payload)

    return hashlib.md5(payload).hexdigest()
Пример #41
0
 def get_article_from_item(self, item):
     """Download the page behind ``item['link']`` and wrap it in an Article.

     The author is ``item.author`` when the item has an ``'author'`` key and
     ``'n/a'`` otherwise.  The ``Article`` receives the readability title,
     short title, that author and the summary HTML, in this order.
     """
     url = item['link']
     logging.debug(url)
     # ``in`` replaces ``has_key``, which exists only on Python 2 mappings.
     author = item.author if 'author' in item else 'n/a'
     html = urllib.urlopen(url).read()
     doc = Document(html)
     return Article(doc.title(), doc.short_title(), author, doc.summary())
Пример #42
0
def news():
    """Search Bing News for ``q`` and render the annotated results.

    Query parameters ``count`` and ``offset`` default to 10 and 0 when
    absent or empty.  Every result is downloaded, reduced to plain text via
    readability and BeautifulSoup, and extended with ``sentiment``,
    ``status``, ``id_rank`` and a formatted ``datePublished``.

    Without ``q``, or when the search request fails, ``news.html`` is
    rendered without data.
    """
    search = request.args.get('q')
    count = request.args.get('count') or 10
    offset = request.args.get('offset') or 0

    if not search:
        return render_template("news.html")

    # SECURITY: the subscription key is hard-coded in the source; it should
    # be moved to configuration or the environment and rotated.
    headers = {'Ocp-Apim-Subscription-Key': 'd94125558b884a309dd71f9e1aa8b9fb'}
    params = urllib.parse.urlencode({
        'q': search,
        'count': count,
        'offset': offset,
        'mkt': 'en-id',
        'safesearch': 'Moderate',
    })

    conn_url = http.client.HTTPSConnection('api.cognitive.microsoft.com')
    try:
        conn_url.request("GET", "/bing/v7.0/news/search?%s" % params, "{body}", headers)
        response = conn_url.getresponse()
        data = response.read().decode('utf-8')
        data_array = json.loads(data)
    except Exception as e:
        # Not every exception carries errno/strerror (e.g. JSON errors), so
        # read them defensively; without data there is nothing to annotate.
        print("[Errno {0}] {1}".format(getattr(e, 'errno', None),
                                       getattr(e, 'strerror', e)))
        return render_template("news.html")
    finally:
        conn_url.close()

    print(data_array)

    for i, result in enumerate(data_array.get('value', [])):
        try:
            response = requests.get(result['url'], verify=False, allow_redirects=False)
        except requests.exceptions.ConnectionError:
            print(result['url'], "Connection refused")
            # Fall back to a page that is expected to be reachable so the
            # result can still be processed.
            response = requests.get("https://pens.ac.id", verify=False)

        print(result['url'])
        doc = Document(response.content)
        raw = BeautifulSoup(doc.summary(html_partial=True), 'html.parser').get_text()
        result['sentiment'] = int(getSentiment(raw))
        print("SENTIMENT : ", result['sentiment'])
        result['status'] = analyze(raw)
        result['id_rank'] = i
        if result.get('datePublished'):
            published = parser.parse(result['datePublished'])
            result['datePublished'] = published.strftime('Diterbitkan pada %d %b %Y pukul %I:%M WIB')
            print(result['datePublished'])

    return render_template("news.html", data=data_array)
def getReadability(url):
    """Return ``(short_title, summary_html)`` for the page at *url*.

    Newlines are removed from the summary.  Any failure while downloading or
    analysing the page yields ``('', '')`` instead of raising.
    """
    try:
        html = urllib.urlopen(url).read()
        # One Document serves both the summary and the title.
        document = Document(html)
        readable_article = document.summary().replace('\n', '')
        readable_title = document.short_title()
        return readable_title, readable_article
    except Exception:
        # Deliberate best effort: callers receive empty strings on error.
        return '', ''
Пример #44
0
def crawl_url(url):
    """Download *url* and return its readable content and title.

    Returns a dict with the keys ``'content'`` (readability summary) and
    ``'title'`` (document title), both encoded as UTF-8.
    """
    response = requests.get(url)
    article = Document(response.content)
    body = article.summary().encode('utf-8')
    heading = article.title().encode('utf-8')
    return dict(content=body, title=heading)
Пример #45
0
 def get_article_from_item(self, item):
     """Build an ``Article`` from the page that ``item['link']`` points to.

     ``item.author`` is used when the item contains an ``'author'`` key,
     otherwise the author is ``'n/a'``.  Title, short title and summary come
     from readability's ``Document``.
     """
     url = item['link']
     logging.debug(url)
     author = 'n/a'
     # Membership test instead of the Python-2-only ``has_key`` method.
     if 'author' in item:
         author = item.author
     html = urllib.urlopen(url).read()
     doc = Document(html)
     return Article(doc.title(), doc.short_title(), author, doc.summary())
Пример #46
0
def get_article(url, referrer=None):
    """Return the readable text content of the page at *url*.

    The HTML is loaded through ``load_url``; when that yields ``None`` the
    function returns ``None`` as well.  Otherwise the partial readability
    summary has ``&amp;`` and ``&#13;`` replaced, its markup stripped by
    BeautifulSoup (elements separated by a space), and every double space
    replaced by a single one in one pass.
    """
    html = load_url(url, referrer)
    if html is None:
        return None

    partial = Document(html).summary(html_partial=True)
    unescaped = partial.replace('&amp;', u'&')
    unescaped = unescaped.replace(u'&#13;', u'\n')
    text = BeautifulSoup(unescaped).getText(separator=u' ')
    return text.replace(u'  ', u' ')
Пример #47
0
def scrape(url,
           pdf_filename,
           pdf_page_size=PDF_PAGE_SIZE,
           folder=OUTPUT_FOLDER,
           clean_it=True,
           css_file=EPUB_CSS,
           lang=EPUB_LANG,
           cover_image=EPUB_COVER,
           isbn=None):
    """Fetch the HTML at *url* and convert it to an EPUB and a PDF.

    Two intermediate HTML files are written to *folder*:
    ``<prefix>_epub.html`` (EPUB source) and ``<prefix>.html`` (PDF source),
    where ``<prefix>`` is *pdf_filename* without its extension.

    With *clean_it* true, readability reduces the page to its main content:
    the EPUB source is that content alone and the PDF source is the content
    placed into ``HTML_FRAME``; the title is the readability short title.
    Otherwise both sources are the fetched HTML as-is and the title is the
    file name prefix.

    ``generate_epub`` / ``generate_pdf`` run only when the respective
    ``write_file`` call returned a truthy value.  When the page cannot be
    fetched a message is printed and nothing is generated.
    """
    raw_html = get_url(url)
    if raw_html is None:
        # Two spaces reproduce the output of the original print statement.
        print("Sorry, could not read  %s" % (url,))
        return

    filename_prefix = os.path.splitext(pdf_filename)[0]
    epub_html_name = os.extsep.join([filename_prefix + '_epub', 'html'])
    pdf_html_name = os.extsep.join([filename_prefix, 'html'])

    if clean_it:
        # Use readability to strip everything but the main content; one
        # Document serves both the title and the summary.
        document = Document(raw_html)
        title = document.short_title()
        content = to_unicode(document.summary(html_partial=True))

        # Unlike PDF, EPUB is styled by CSS, so it gets the bare content
        # while the PDF gets the content wrapped in the HTML frame.
        epub_html = content
        pdf_html = HTML_FRAME.substitute(content=content, url=url, title=title)
    else:
        # No cleaning requested: use the fetched HTML unchanged.
        title = filename_prefix
        epub_html = pdf_html = to_unicode(raw_html)

    epub_source = write_file(folder, epub_html_name, epub_html)
    pdf_source = write_file(folder, pdf_html_name, pdf_html)

    if epub_source:
        generate_epub(folder, filename_prefix, title,
                      os.path.join(folder, epub_html_name),
                      css_file, cover_image, lang, isbn)

    if pdf_source:
        generate_pdf(folder, filename_prefix, pdf_page_size)
Пример #48
0
def body_via_readability(page_html, source_url):
    """
    Extract the article body of *page_html* with readability.

    Returns ``None`` when the summary is empty, otherwise the result of
    ``html.prepare`` applied to the summary and *source_url*.
    """
    body = Document(page_html).summary()
    return html.prepare(body, source_url) if body else None
Пример #49
0
def fetch_url(url):
    '''
    Fetch *url* and return ``(short_title, text)`` of its main article.

    The page is loaded with ``basic_fetch_url``; readability provides the
    summary and the short title, and BeautifulSoup strips the markup from
    the summary.
    '''
    html = basic_fetch_url(url)
    # One Document serves both the summary and the title.
    document = Document(html)
    readable_article = document.summary()
    title = document.short_title()
    text = BeautifulSoup(readable_article).get_text()

    return title, text
Пример #50
0
 def readability_extractor(self, html):
     """Return the readability summary of *html*.

     Falls back to ``self.html2text_extractor(html)`` when readability
     raises or produces an empty summary.
     """
     try:
         content = Document(html).summary()
     except Exception:
         # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
         # and SystemExit are not swallowed by the fallback.
         return self.html2text_extractor(html)
     if content:
         return content
     return self.html2text_extractor(html)
Пример #51
0
 def extract_article(self):
     """Extract the readable part of ``self._html``.

     Returns:
         dict with ``'title'`` (readability document title) and
         ``'content'`` (readability summary HTML).
     """
     readable = Document(self._html)
     heading = readable.title()
     body = readable.summary()
     return dict(title=heading, content=body)
Пример #52
0
 def extract_data(self, patchurl):
     """Download *patchurl* and return ``(short_title, summary)``.

     Both values are passed through ``smart_str``.  On any error during
     download or extraction ``(None, None)`` is returned.
     """
     try:
         f = requests.get(patchurl)
         doc = Document(f.content)
         title = doc.short_title()
         summary = doc.summary()
         return smart_str(title), smart_str(summary)
     except Exception:
         # Best effort by design, but do not trap KeyboardInterrupt or
         # SystemExit the way a bare ``except`` would.
         return None, None
Пример #53
0
def extract_by_readability(html):
    """Return the readability title and tag-free body of *html*.

    Returns a dict with ``'title'`` (short title) and ``'body'`` (summary
    with everything matching ``<...>`` removed); both pass through
    ``ensure_unicode`` first.
    """
    document = Document(html)
    title = ensure_unicode(document.short_title())
    markup = ensure_unicode(document.summary())
    # Crude tag stripping: drop every "<...>" run that contains no other "<".
    body = re.sub(r'<[^<]+?>', '', markup)
    return {'title': title, 'body': body}
Пример #54
0
def decode_doc(doc, url):
    """Decode the raw lines of an HTML page and extract its readable content.

    Args:
        doc: re-iterable sequence of ``bytes`` lines (it is traversed twice).
        url: address of the page; copied into the result unchanged.

    The charset and the keywords are sniffed from lines that start with
    ``<meta`` / ``<META``.  Lines that cannot be decoded are replaced by
    empty strings.  When no charset is declared, UTF-8 is assumed.

    Returns:
        dict with ``url``, ``keywords`` (list produced by splitting on
        space, ``,``, ``;`` and ``|``), ``title`` and ``content``; or
        ``None`` when readability cannot process the document.
    """
    cs = re.compile(b'^<(meta|META).*charset=("|\')?([^ "\']*)')
    pkey = re.compile(b'^<(meta|META).*keywords.*content=("|\')?([^ "\']*)')
    codec = None
    keywords = None
    for l in doc:
        if not l.startswith((b'<meta', b'<META')):
            continue
        if codec is None and b'charset' in l:
            m = cs.match(l)
            # The word "charset" can appear without a ``charset=`` attribute,
            # in which case the pattern does not match.
            if m:
                codec = m.group(3).decode('ascii', 'ignore') or None
        if keywords is None and b'keywords' in l:
            m = pkey.match(l)
            if m:
                keywords = m.group(3)

    # Without a declared charset every ``decode`` call used to fail and the
    # whole document was blanked; fall back to UTF-8 instead.
    if codec is None:
        codec = 'utf-8'

    sdoc = []
    for l in doc:
        try:
            sdoc.append(l.decode(codec))
        except (UnicodeDecodeError, LookupError):
            # Undecodable line or unknown codec name: keep the line slot.
            sdoc.append('')

    try:
        keywords = keywords.decode(codec) if keywords else ''
    except (UnicodeDecodeError, LookupError):
        keywords = ''
    keywords = re.split(r'[ ,;\|]', keywords)

    doc = '\n'.join(sdoc)
    try:
        doc = Document(doc)
        title = doc.short_title()
        content = doc.summary()
    except Exception:
        # readability could not parse the page; signal "no data".
        return

    data = {"url": url,
            'keywords': keywords,
            'title': title,
            'content': content}
    return data
Пример #55
0
def main():
    """Run the extraction pipeline on the bundled sample page.

    Loads ``./samples/21853124_0.shtml`` and calls ``transform``,
    ``get_publish_date``, ``short_title`` and ``text_content`` on the
    resulting ``Document``; the return values are discarded.
    """
    # ``with`` closes the sample file instead of leaking the handle.
    with open('./samples/21853124_0.shtml') as sample:
        html = sample.read()
    doc = Document(html)
    doc.transform()
    doc.get_publish_date()
    doc.short_title()
    doc.text_content()
Пример #56
0
 def _update(self, response):
   """Store the readable article of *response* with its images embedded.

   Every ``<img>`` of the readability summary is downloaded and base64
   encoded.  Encodings shorter than 3000 characters are inlined as ``data:``
   URIs; longer ones are referenced by ``<sha1 of encoding>.<extension>``
   and appended after the markup as ``--data:<name>`` sections.  The result
   is assigned to ``self.article`` and ``self.save()`` is called.
   """
   app.logger.debug("Updating %s" % response.url)
   summary_html = Document(response.text).summary()
   tree = lxml.html.fromstring(summary_html)
   img_nodes = tree.xpath("//img")
   app.logger.debug("%d images for %s",len(img_nodes), response.url)

   attachments = []
   for node in img_nodes:
     absolute = urlparse.urljoin(response.url, node.get("src"))
     fetched = requests.get(absolute)
     payload = base64.b64encode(fetched.content)
     if len(payload) >= 3000:
       # Too large to inline: name it after the SHA-1 of the encoded data.
       hasher = hashlib.sha1()
       hasher.update(payload)
       new_src = hasher.hexdigest() + "." + absolute.rpartition(".")[2]
       attachments.append((new_src, payload))
     else:
       new_src = "data:" + fetched.headers["content-type"] + ";base64," + payload
     node.set("src", new_src)

   out = StringIO()
   out.write(lxml.etree.tostring(tree, pretty_print=True))
   for name, blob in attachments:
     out.write("\n--data:"+name+"\n"+blob)
   out.seek(0)
   self.article = out.read()
   self.save()
Пример #57
0
def get_main_text(html):
    """Return the plain text of the main article contained in *html*.

    The readability summary is stripped of markup by BeautifulSoup, runs of
    newlines are collapsed into one, and a leading newline is removed.
    """
    summary = Document(html).summary()
    text = BeautifulSoup(summary).getText()
    # Collapse blank lines.
    text = re.sub(r'\n+', '\n', text, flags=re.M | re.S)
    # Drop the newline at the very beginning, if any.
    if text.startswith('\n'):
        text = text[1:]
    return text
Пример #58
0
    def parse_news_content(self, response):
        """Yield follow-up requests for full-article links, then the item.

        Each link found by ``full_article_link_extractor`` becomes a copy of
        the current request pointing at the link's URL.  Afterwards, if
        ``_create_item`` returns an item, it is filled with the readability
        short title and the html2text rendering of the summary, and yielded.
        """
        links = self.full_article_link_extractor.extract_links(response)
        for link in links:
            yield response.request.replace(url=link.url)

        item = self._create_item(response)
        if item is None:
            return

        readable = Document(response.body)
        item['title'] = readable.short_title()
        item['content'] = html2text.html2text(readable.summary())
        yield item
Пример #59
0
def tell_url(un, url):
    """Classify the page at *url* and return the human-readable class name.

    The page is downloaded, reduced to its readability summary, converted
    to text by ``extract_text`` and passed to ``un.tell_buff``.

    Returns:
        ``un.get_class_name_human(class_name)`` when ``tell_buff`` returns a
        truthy class name, otherwise ``None``.
    """
    from contextlib import closing

    # ``closing`` releases the connection; the original never closed it.
    with closing(urllib2.urlopen(url)) as buff:
        raw = buff.read()

    html_buff = Document(raw).summary()
    text_buff = extract_text(html_buff)
    class_name = un.tell_buff(text_buff)
    if not class_name:
        return None
    return un.get_class_name_human(class_name)