Example #1
def main():
    html = open('./samples/21853124_0.shtml').read()
    doc = Document(html)
    doc.transform()
    doc.get_publish_date()
    doc.short_title()
    doc.text_content()
    def strip_chapter(self, html):
        """
        Strips chapter and gets relevant HTML using Readability
        :param html: str
        :return:
        """
        doc = Document(html)
        if len(doc.summary()) <= 20:
            content = str(BeautifulSoup(html, 'html.parser').find_all('div', class_=self.main_content_div)[0])
            content = '<html><head><meta charset="utf-8"></head>' + content + '</html>'
            return doc.short_title(), content

        return (doc.short_title(),
                str(doc.summary()).replace('<html>', '<html><head><meta charset="utf-8"></head>'))
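Note: the snippets in this listing share one pattern: build a Document from raw HTML, take short_title() for the headline and summary() for the cleaned article markup, then flatten that markup to plain text. A minimal sketch of that pattern, assuming readability-lxml, requests and lxml are installed; the URL below is a placeholder:

import requests
import lxml.html
from readability.readability import Document

def extract(url):
    # fetch the raw page; any HTTP client used in these examples works here
    html = requests.get(url, timeout=10).text
    doc = Document(html)
    title = doc.short_title()          # "readable" title
    article_html = doc.summary()       # cleaned article markup
    # flatten the cleaned markup to plain text
    text = lxml.html.fromstring(article_html).text_content().strip()
    return title, article_html, text

if __name__ == '__main__':
    # placeholder URL; replace with a real article page
    title, article_html, text = extract('https://example.com/article')
    print(title)
    print(text[:200])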
Example #3
def read_command(api, args):
    from readability.readability import Document
    import html2text
    h = html2text.HTML2Text()
    h.inline_links = False
    h.ignore_images = True
    h.ignore_emphasis = True
    res = requests.get(args.url)
    if res.ok:
        article = Document(res.content)
        print article.short_title()
        print h.handle(article.summary())
    else:
        print res.headers['status']
    def strip_chapter(self, html):
        """
        Strips chapter and gets relevant HTML using Readability
        :param html: str
        :return:
        """
        doc = Document(html)
        if len(doc.summary()) <= 20:
            print 'This page has errors, returning entry-content div raw HTML.'
            content = str(BeautifulSoup(html, 'html.parser').find_all('div', class_=self.main_content_div)[0])
            content = '<html><head><meta charset="utf-8"></head>' + content + '</html>'
            return doc.short_title(), content

        return (doc.short_title(),
                str(doc.summary()).replace('<html>', '<html><head><meta charset="utf-8"></head>'))
Example #5
def extract_article(url):
    r = requests.get(url)

    # if the url exists, continue
    if r.status_code == 200:

        # extract and parse response url
        url = parse_url(r.url)

        # extract html
        html = r.content.decode('utf-8', errors='ignore')

        # run boilerpipe
        # boilerpipe_extractor = Extractor(html=html)

        # run readability
        readability_extractor = Document(html)

        html = readability_extractor.summary()
        # return article data
        return {
            'title': readability_extractor.short_title(),
            'html': html,
            'content': strip_tags(html).encode('utf-8', errors='ignore'),
            'url': url
        }

    # otherwise return an empty dict
    else:
        return {}
 def get_data(url):
     error_num = 0
     while True:
         if error_num >= 10:
             cprint("Finished Because error_num reached 10 times", "red")
             return 0, 0
         try:
             req = requests.get(url)
             if int(req.status_code) == 503:
                 cprint("Google detected the abnormal network traffic", "red")
                 time.sleep(60 * 60)
             elif int(req.status_code) != 200:
                 cprint("Now Get StatusCode{}: Error_num{}".format(req.status_code, error_num), "red")
                 return 0, 0
             else:
                 html = req.text
                 break
         except ConnectionError:
             cprint("Now Get ConnectionError: Error_num{}".format(error_num), "red")
             error_num += 1
             time.sleep(5)
     try:
         document = Document(html)
         content_html = document.summary()
         content_text = lxml.html.fromstring(content_html).text_content().strip()
         short_title = document.short_title()
         return short_title, content_text
     except:
         return 0, 0
Example #7
class Gist:

    keyword_pattern = re.compile(r'^[^\d]+$')
    stop_words = set(get_stop_words('en'))

    def __init__(self, html):
        self.html = html
        self.document = Document(html)

    @property
    def title(self):
        return self.document.short_title()

    @cached_property
    def text(self):
        text = self.document.summary()
        text = re.sub('<br[^>]+>', '\n', text)
        text = re.sub('</?p[^>]+>', '\n\n', text)
        text = re.sub('<[^>]+>', '', text)
        text = re.sub('^[ \t]+$', '', text)
        text = re.sub('\n{3,}', '\n\n', text, flags=re.MULTILINE)
        return text

    @staticmethod
    def _common_prefix(one, two):
        parallelity = [x == y for x, y in zip(one, two)] + [False]
        return parallelity.index(False)

    @classmethod
    def _find_representative(cls, stem, text):
        tokens = text.split()
        prefixes = {token: cls._common_prefix(token, stem) for token in tokens}
        best = lambda token: (-token[1], len(token[0]))
        return sorted(prefixes.items(), key=best)[0][0]

    @classmethod
    def _is_good_keyword(cls, word):
        return (word not in cls.stop_words) and \
                cls.keyword_pattern.match(word)

    @classmethod
    def find_keywords(cls, text):
        whoosh_backend = SearchForm().searchqueryset.query.backend
        if not whoosh_backend.setup_complete:
            whoosh_backend.setup()
        with whoosh_backend.index.searcher() as searcher:
            keywords = searcher.key_terms_from_text(
                'text', text, numterms=10, normalize=False)
        keywords = list(zip(*keywords))[0] if keywords else []
        keywords = [cls._find_representative(keyword, text) for keyword in keywords]
        keywords = [keyword for keyword in keywords if cls._is_good_keyword(keyword)]
        #no double keywords in list
        keywords = list(set(keywords))
        #no punctuation in suggested keywords
        keywords = [''.join(c for c in s if c not in string.punctuation) for s in keywords]
        return keywords

    @property
    def keywords(self):
        return self.find_keywords(self.text)
def extract_article(url):
  r = requests.get(url)
  
  # if the url exists, continue
  if r.status_code == 200:
    
    # extract and parse response url
    url = parse_url(r.url)

    # extract html
    html = r.content.decode('utf-8', errors='ignore')

    # run boilerpipe
    # boilerpipe_extractor = Extractor(html=html)

    # run readability
    readability_extractor = Document(html)

    html = readability_extractor.summary()
    # return article data
    return {
      'title': readability_extractor.short_title(),
      'html': html,
      'content': strip_tags(html).encode('utf-8', errors='ignore'),
      'url': url
    }

  # otherwise return an empty dict
  else:
    return {}
Example #9
def extract_article(url):
  r = requests.get(url)
  
  # if the url exists, continue
  if r.status_code == 200:
    
    # extract and parse response url
    url = parse_url(r.url)

    # extract html
    html = r.content.decode('utf-8', errors='ignore')

    # run boilerpipe
    BP = Extractor(html=html)

    # run readability
    Rdb = Document(html)

    html = Rdb.summary()
    # return article data
    return {
      'extracted_title': Rdb.short_title().strip(),
      'extracted_content': strip_tags(BP.getText()),
    }

  # otherwise return an empty dict
  else:
    return {}
Example #10
def markdownify(url_list, **options):
    articles = []
    images = []
    paragraph_links = options['paragraph_links']
    wrap_text = options['wrap_text']
    preamble = options['preamble']
    for url in url_list:
        req = urllib2.Request(url,None,{'Referer': url_list[0]})
        html = urllib2.urlopen(req).read()
        document = Document(html, url=url)
        readable_title = document.short_title()
        summary = document.summary()
        summary_doc = build_doc(summary)
        images.extend([a.get('src') for a in summary_doc.findall('.//img')])
        articles.append(document.summary())

    markdown_articles = []
    for (article, url) in zip(articles, url_list):
        h = html2text.HTML2Text(baseurl=url)
        h.inline_links = False
        h.links_each_paragraph = (paragraph_links and 1) or 0
        h.body_width = (wrap_text and 78) or 0
        markdown_articles.append(h.handle(article))
    combined_article = u"\n\n----\n\n".join(markdown_articles)
    if preamble:
        combined_article = (u"Title:        %s  \nOriginal URL: %s\n\n" % (readable_title, url_list[0])) + combined_article
    return combined_article.encode("utf-8")
Example #11
def process(doc, params):
    url = params['url']
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir='/tmp/')
            img_src = urljoin(url, img.get('src'))
            img_name = None
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                img_name = get_filename_from_url(img_src)
                write_file(r, fp)
            else:
                img_meta, content = img_src.split(',')
                image = base64.b64decode(content)
                img_name = get_filename_from_base64(img_meta)
                fp.write(image)
            images.append((img_name, fp))
        except Exception:
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )

    html = '<h1>' + title + '</h1>' + summary
    html = '<p>{}</p>'.format(html)

    text = html2text.html2text(html)
    return text, images, 1, None
Example #12
def get_summary(url):
    html = urllib.request.urlopen(url).read()
    doc = Document(html)
    doc.parse(["summary", "short_title"])
    readable_article = doc.summary()
    readable_title = doc.short_title()
    return readable_article, readable_title
Example #13
File: html.py Project: the-deep/server
def process(doc, url):
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR)
            img_src = urljoin(url, img.get('src'))
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                write_file(r, fp)
            else:
                image = base64.b64decode(img_src.split(',')[1])
                fp.write(image)
            images.append(fp)
        except Exception:
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {
                    'url': url
                }},
            )

    html = '<h1>' + title + '</h1>' + summary

    regex = re.compile('\n*', flags=re.IGNORECASE)
    html = '<p>{}</p>'.format(regex.sub('', html))

    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images, 1
Example #14
def reada(url, cache=True):

    if cache:
        cached = memcache.get(key=url)
        if cached is not None:
            return cached

    #file = urllib.urlopen(url)
    #import urllib2
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    file = opener.open(url)
    ##
    enc = 'utf-8'
    text = ''
    try:
        # 1, web html 2 readability
        raw = Document(file.read(), url=url)
        html = raw.summary().encode(enc, 'replace')
        title = raw.short_title()

        # 2, readability 2 markdown, copy from main
        data = html.decode(enc)
        h = html2text.HTML2Text(baseurl=url)
        h.ignore_images = False
        h.body_width = 100000
        text = h.handle(data)
    finally:
        file.close()

    d = {'url': url, 'title': title, 'content': text}
    if cache:
        memcache.add(key=url, value=d, time=600)
    return d
Example #15
    def _parse_article(self, response):
        feed_entry = response.meta["feed_entry"]

        il = FeedEntryItemLoader(parent=response.meta["il"])
        try:
            response.text
        except AttributeError:
            # Response is not text (e.g. PDF, ...).
            il.add_value("title", feed_entry.get("title"))
            il.add_value("content_html", feed_entry.get("summary"))
            return il.load_item()

        doc = Document(response.text, url=response.url)
        il.add_value("title", doc.short_title() or feed_entry.get("title"))
        summary = feed_entry.get("summary")
        try:
            content = doc.summary(html_partial=True)
            if summary and len(summary) > len(content):
                # Something probably went wrong if the extracted content is shorter than
                # the summary.
                raise Unparseable
        except Unparseable:
            content = summary
        il.add_value("content_html", content)

        return il.load_item()
Example #16
def process(doc):
    html_body = Document(doc)

    summary = html_body.summary()
    title = html_body.short_title()

    text = text_maker.handle(summary)
    return title, text
def main():
    html = urllib.urlopen("http://habrahabr.ru/post/150756/").read()
    doc = Document(html)
    short_title = doc.short_title()
    readable_article = doc.summary()
    f = open("C:\\users\\mykola\\documents\\%s.html" % short_title, "wb")
    f.write(readable_article.encode("utf-8"))
    f.close()
Example #19
 def get_article_from_item(self, item):
     url = item['link']
     logging.debug(url)
     author = 'n/a'
     if item.has_key('author'):
         author = item.author
     html = urllib.urlopen(url).read()
     doc = Document(html)
     return Article(doc.title(), doc.short_title(), author, doc.summary())
Example #20
def process_html(html):
    doc = Document(html)
    return {
        'content': doc.content(),
        'clean_html': doc.get_clean_html(),
        'short_title': doc.short_title(),
        'summary': html_to_text(doc.summary()),
        'title': doc.title()
    }
Example #21
 def get_article_from_item(self, item):
     url = item['link']
     logging.debug(url)
     author = 'n/a'
     if item.has_key('author'):
         author = item.author
     html = urllib.urlopen(url).read()
     doc = Document(html)
     return Article(doc.title(), doc.short_title(), author, doc.summary())
Example #22
 def extract_data(self, patchurl):
     try:
         f = requests.get(patchurl)
         html = f.content
         doc = Document(html)
         title = doc.short_title()
         summary = doc.summary()
         return smart_str(title), smart_str(summary)
     except:
         return None, None
Example #23
 def extract_data(self, patchurl):
     try:
         f = requests.get(patchurl)
         html = f.content
         doc = Document(html)
         title = doc.short_title()
         summary = doc.summary()
         return smart_str(title), smart_str(summary)
     except:
         return None, None
Example #24
def extract_by_readability(html):
    document = Document(html)

    def strip_html(html):
        return re.sub(r'<[^<]+?>', '', html)

    return {
        'title': ensure_unicode(document.short_title()),
        'body': strip_html(ensure_unicode(document.summary())),
    }
Example #25
def decode_doc(doc, url):
    #print('doc')
    cs = re.compile(b'^<(meta|META).*charset=("|\')?([^ "\']*)')
    pkey = re.compile(b'^<(meta|META).*keywords.*content=("|\')?([^ "\']*)')
    codec = None
    keywords = None
    #print(*doc)
    for l in doc :
        if (l.startswith(b'<meta') or l.startswith(b'<META')) :
            if codec is None and (b'charset' in l) :
                m = cs.match(l)
                codec = m.group(3).decode()
            if keywords is None and b'keywords' in l :
                m = pkey.match(l)
                if m :
                    keywords = m.group(3)


    sdoc = []
    for l in doc :
        try :
            l = l.decode(codec)
        except :
            l = ''
        sdoc.append(l)

    try :
        if keywords :
            keywords = keywords.decode(codec)
        else :
            #print(*sdoc, sep = '\n')
            keywords = ''
        keywords = re.split(r'[ ,;\|]',keywords)
        #print(keywords.encode('utf8'))
    except :
        pass

    #if sum(len(x) for x in sdoc) < 1000 : return
    doc = '\n'.join(sdoc)
    #if len(doc) < 1000 :return
    try :
        doc = Document(doc)
        title = doc.short_title()
        content = doc.summary()
    except :
        return
    #print(doc.summary().encode('utf8'))
    #print(doc.short_title().encode('utf8'))


    data = {"url":url, 
            'keywords':keywords,
            'title': title,
            'content':content}
    return data
Example #26
def try_readability():
    html = urllib.request.urlopen(ARTICLE).read()

    doc = Document(html)
    con = BeautifulSoup(doc.summary()).get_text()
    tit = doc.short_title()
    print("===READABILITY===")
    print("=CONTENT=")
    print(con)
    print("=TITLE=")
    print(tit)
Example #27
File: wdzj.py Project: LightKool/scraper
    def parse_news_content(self, response):
        for link in self.full_article_link_extractor.extract_links(response):
            request = response.request.replace(url=link.url)
            yield request

        item = self._create_item(response)
        if item is not None:
            doc = Document(response.body)
            item['title'] = doc.short_title()
            item['content'] = html2text.html2text(doc.summary())
            yield item
Example #28
File: tasks.py Project: Daroth/collectr
    def extract_url_content(self, url=None):
        if not url:
            url = self.url
        url_parse = urlparse(url)
        headers = {}
        if url_parse.netloc != "t.co":
            user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1 Iceweasel/9.0.1"
            headers['User-Agent'] = user_agent

        content = requests.get(url, headers=headers)
        self.content_type = content.headers.get('content-type')
        self.status_code = content.status_code
        self.content = content.text
        self.url = self.clean_url(self.url)
        self.url = self.url_morph(content.url)
        self.image = self.find_taller_image(self.content)
        if self.image:
            self.logger.info("found image : %s"%self.image)
        self.url_parse = urlparse(self.url)

        if url_parse.netloc in oembed.keys():
            print "found oembed"
            mod = oembed[url_parse.netloc]
            self.content = mod.get_widget(url)
            self.summary = self.content
            self.title = os.path.basename(url_parse.path)
            self.content_type = "collectr/parsed"
            self.tags = [mod.get_tag()]
            self.tagstring = mod.get_tag()
            return



        if self.status_code >= 400:
            raise UrlExtractException("Can't extract content for %s (http<%d>)" % (url, content.status_code))

        elif "image" in self.content_type:
            print "log: content type : image"
            self.summary = """<img src="%s" />""" % self.url
            self.title = self.url

        elif "html" in self.content_type:
            doc = Document(self.content)
            self.summary = doc.summary()
            try:
                self.title = doc.short_title()
            except AttributeError:
                self.title = u"No title"


        else:
            self.summary = None
            self.title = os.path.basename(url_parse.path)
Example #29
def import_html(results, content):
    content = Document(content)

    converter = HTML2Text()
    converter.body_width = 0

    body = content.summary()
    text = BeautifulSoup(body).get_text(" ")

    results.investigation.update(name=content.short_title(),
                                 import_md=converter.handle(body),
                                 import_text=text)
Example #30
def extract(html):
    try:
        doc = Document(html)
        article = doc.summary()
        title = doc.short_title()
        return {
            'title': title,
            'article': html_to_text(article),
            'full_text': html_to_text(html)
        }
    except:
        logging.exception('extract html')
        return {}
Example #31
File: ff.py Project: zeekay/ff
def read_command(args):
    try:
        s_idx, t_idx = (int(x) for x in args.idx.split(':'))
        url = args.session.tabs[s_idx].entries[t_idx].url
    except:
        print 'Invalid index'
        return

    import requests
    from readability.readability import Document
    import html2text
    h = html2text.HTML2Text()
    h.inline_links = False
    h.ignore_images = True
    h.ignore_emphasis = True
    res = requests.get(url)
    if res.ok:
        article = Document(res.content)
        print article.short_title()
        print h.handle(article.summary())
    else:
        print res.headers['status']
Example #33
File: html.py Project: raymundl/yeti
def import_html(results, content):
    content = Document(content)

    converter = HTML2Text()
    converter.body_width = 0

    body = content.summary()
    text = BeautifulSoup(body).get_text(" ")

    results.investigation.update(
        name=content.short_title(),
        import_md=converter.handle(body),
        import_text=text)
Example #34
def parse(filename):
    html = open(filename, encoding="latin").read()
    doc = Document(html)
    summary = doc.summary()
    summary = re.sub('(<map.*?</map>)', '', summary, flags=re.M)
    summary = re.sub(r"<img.*?usemap=.*?>", '', summary, flags=re.M)
    summary = re.sub(r'<a href="index.html"><img.*?/></a>', '', summary, flags=re.M)
    if 'href="index.html"' in summary:
        raise Exception("FAIIILEEED")

    print("<small>" + doc.short_title() + "</small>")
    print("<p>" + summary + "<p>")
    print("<p class='breakhere'></p>")
Example #35
def parse_html(url):
    response = request(url)
    if not response:
        return response

    document = Document(response.content)
    doc = {
        'titulo': document.short_title(),
        'texto': document.summary(),
        'site': urlparse(url).netloc,
        'url': get_object_or_404(Url, url=url),
        'imagem': get_image(response.content, urlparse(url).netloc)
    }
    return doc
Example #36
def read_extractor(html, url):
    '''readability extractor'''
    try:
        clean_doc = Document(html,url = url, positive_keywords=",".join(POSITIVE_K) , negative_keywords=",".join(NEGATIVE_K))
    
        #summary = clean_doc.summary()
    
        article = clean_doc.article
        text = re.sub("  |\t", " ",bs(article, "lxml").get_text())
        title = clean_doc.short_title()
    
        return (title, clean_doc, text)
    except Exception as e:
        return False
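Note: Example #36 (and the Page class in a later example) passes positive_keywords and negative_keywords to Document so that elements whose class or id match those patterns are scored up or down during extraction. A minimal sketch of the same call with illustrative keyword lists; the keyword values here are assumptions, not the ones used by the original projects:

from readability.readability import Document

# illustrative stand-ins for the original POSITIVE_K / NEGATIVE_K lists
POSITIVE_K = ["article", "entry-content", "post"]
NEGATIVE_K = ["comment", "sidebar", "widget"]

def biased_extract(html, url):
    doc = Document(
        html,
        url=url,
        positive_keywords=",".join(POSITIVE_K),
        negative_keywords=",".join(NEGATIVE_K),
    )
    return doc.short_title(), doc.summary()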
Example #37
def extractMainArticle(html):
    p = Document(html)
    readable_article = p.summary()
    readable_title = p.short_title()

    soup = BeautifulSoup(readable_article)
    text_nodes = soup.findAll(text=True)
    text = ''.join(text_nodes)

    #text = readable_title + " " + text
    #return text

    wtext = {"title": readable_title, "text": text}
    return wtext
Example #38
def extractMainArticle(html):
    p = Document(html)
    readable_article = p.summary()
    readable_title = p.short_title()
    
    soup = BeautifulSoup(readable_article)
    text_nodes = soup.findAll(text=True)
    text = ''.join(text_nodes)
    
    #text = readable_title + " " + text
    #return text
    
    wtext = {"title":readable_title, "text": text}
    return wtext
Example #39
def parse_with_readability(html):
    """
    Return

        {
            'title': '',
            'summary': ''
        }
    """
    doc = Document(html)
    return {
        'title': doc.short_title(),
        'summary': doc.summary(html_partial=True)
    }
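Note: summary(html_partial=True), as used here and in Example #15, returns only the extracted fragment, without the <html>/<body> wrapper that the default output carries (which is why Examples #1 and #3 can splice a <head> into the full summary with a simple string replace). A small sketch, under those assumptions, of re-wrapping a partial summary into a standalone page with an explicit charset:

from readability.readability import Document

def readable_page(html):
    doc = Document(html)
    title = doc.short_title()
    fragment = doc.summary(html_partial=True)  # article fragment only, no <html>/<body>
    # wrap the fragment into a minimal standalone document, mirroring the
    # charset handling Examples #1 and #3 do by hand
    page = ('<html><head><meta charset="utf-8"><title>{}</title></head>'
            '<body>{}</body></html>').format(title, fragment)
    return title, page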
Example #40
class WebInfoExtractor:
    def __init__(self, url):
        self.url = url
        self.readable = None
        self.page = None

        try:
            head = requests.head(url, headers=HEADERS)
        except requests.exceptions.RequestException:
            return

        if 'text/html' in head.headers.get('content-type', ''):
            try:
                html = requests.get(url, headers=HEADERS).text
            except requests.exceptions.RequestException:
                return

            self.readable = Document(html)
            self.page = BeautifulSoup(html, 'lxml')

    def get_title(self):
        return self.readable and self.readable.short_title()

    def get_date(self):
        return extract_date(self.url, self.page)

    def get_country(self):
        if not self.page:
            return None
        country = self.page.select('.primary-country .country a')
        if country:
            return country[0].text.strip()

        country = self.page.select('.country')
        if country:
            return country[0].text.strip()

        return None

    def get_source(self):
        if self.page:
            source = self.page.select('.field-source')
            if source:
                return source[0].text.strip()

        return tldextract.extract(self.url).domain

    def get_website(self):
        return urlparse(self.url).netloc
Example #41
 def _parse_article(self, response):
     feed_entry = response.meta["feed_entry"]
     il = FeedEntryItemLoader(parent=response.meta["il"])
     doc = Document(response.text, url=response.url)
     il.add_value("title", doc.short_title() or feed_entry.get("title"))
     summary = feed_entry.get("summary")
     try:
         content = doc.summary(html_partial=True)
         if summary and len(summary) > len(content):
             # Something probably went wrong if the extracted content is shorter than
             # the summary.
             raise Unparseable
     except Unparseable:
         content = summary
     il.add_value("content_html", content)
     return il.load_item()
Example #42
def getDoc2(url):
    t = time.time()
    # import urllib
    # html = urllib.urlopen(url).read()
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'}
    r = get(url,headers=headers)
    html = r.content

    doc = Document(html,url=url)
    readable_article = doc.summary()
    readable_title = doc.short_title()
    readable_article = readable_article.replace("http","/?url=http")
    timeElapsed = int((time.time()-t)*1000)
    fileSize = 0.7 + float(sys.getsizeof(readable_article)/1000.0)
    fileSize = round(fileSize,1)
    return {'title':readable_title,'description':"",'url':url,'timeElapsed':timeElapsed,'content':readable_article,'size':fileSize}
Example #43
def make_readable(url):
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return None

    document = Document(html)

    document_dict = {
        'title': document.title(),
        'summary': document.summary(),
        'content': document.content(),
        'short_title': document.short_title()
    }

    return document_dict
Example #44
def scrape(result):
    start_url = result.url
    response = requests.get(start_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    document = Document(response.content)
    body_soup = BeautifulSoup(document.summary(), 'html.parser')
    links = [Link(url=el['href']) for el in soup.select('a') if el.get('href')]
    for link in links:
        link.save()
    content = Content(
        result=result,
        html=soup.prettify(),
        title=document.short_title(),
        body_html=body_soup.prettify(),
        body=body_soup.get_text().strip(),
    )
    content.save()
    content.links.add(*links)
Example #45
def main():  
    url = sys.argv[1]
    #url = 'http://jandan.net/2016/01/04/safe-sex-education.html'
    config = get_login_info()
    apikey = config['apikey']
    
    html_text = getHtml(url)
    doc = Document(html_text)
    readable_article = doc.summary()
    readable_title = doc.short_title()
    soup = BeautifulSoup(readable_article)
    final_article = soup.text
    print "原文:"
    print final_article
    eng_article = baidu_translate(apikey, final_article, 'zh', 'en')
    print "英文:"
    print eng_article
    zh_article_back = baidu_translate(apikey, eng_article, 'en', 'zh')
    print "中文:"
    print zh_article_back
Example #46
def fetch_privacy_policy(policy_url):
    print('fetch_privacy_policy', policy_url)

    # Extract domain
    ext = tldextract.extract(policy_url)
    domain = ext.domain
    suffix = ext.suffix
    registered_domain = ext.registered_domain
    print('domain', domain)

    # Fetch policy page
    print('Fetch policy', policy_url)
    content = fetch(policy_url)

    if not content:
        return

    lowered = content.lower()
    # if not any(keyword in KEYWORDS for keyword in lowered.split()):
    #     print('No keyword found')
    #     return
    if len(lowered) < 1600:
        print('Too short:', len(content))
        return

    # Extract content
    readability = Document(content)
    title = readability.short_title()
    clean_content = readability.summary()
    lang = langdetect.detect(clean_content)

    return {
        "raw_content": content,
        "clean_content": clean_content,
        "lang": lang,
        "title": title,
        "url": policy_url,
        "suffix": suffix,
        "registered_domain": registered_domain
    }
    def extract(self, response, link): #extract

#    for link in link_list:
    #       response = ulib.urlopen(link).read()
            
        #get relevant content using readability
        readable = Document(response)
        body = readable.summary()
        title = readable.short_title()

        #strip extra html readability leaves in, like p tags
        title = html.fromstring(title).text_content()
        body = html.fromstring(body).text_content()
        title = condense_whitespace(title)
        body = condense_whitespace(body)

        links = self.extra['links']
       
        try: 
            d = unicode(self.extra['dates'][links.index(link)])
        except:
            #pr web rss feeds don't have pubdate
            html_body = html.fromstring(response)
            d = re.sub('.*\(.*\)', '', html_body.find_class('releaseDateline')[0].text_content())

        #print d
        
        try:                
            date = parse(d)
        except:
            date = datetime.now()

        doc = { 'url': link,
                'title': title,
                'text': body,
                'date': date,
                'source': self.extra['source'][links.index(link)]}

        return doc
Example #48
def evernotify(html, url):
  doc = Document(html, url=url)
  html = doc.summary()
  allowed_tags = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'bdo', 'big', 'blockquote', 'br', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'map', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'tt', 'u', 'ul', 'var', 'xmp']
  disallowed_attrs = ['id', 'class', 'onclick', 'ondblclick', 'accesskey', 'data', 'dynsrc', 'tabindex', 'content']
  soup = bs4(html)
  body = soup.body
  body.name = "en-note"
  pid = 0
  for tag in body.find_all(lambda b: True, recursive=True):
    if tag.name not in allowed_tags:
      tag.name = "span"
    for attr in filter(lambda d: tag.attrs.get(d, False), disallowed_attrs):
      del(tag[attr])
    for attr in filter(lambda a: a.startswith('item'), tag.attrs.keys()):
      del(tag[attr])
  body = body.prettify()
  body = '<?xml version="1.0" encoding="UTF-8"?>\n\
      <!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">\n%s' % body
  body = body.encode('utf-8')
  print body
  return doc.short_title(), body
Example #49
    def simplify(self):
        if not self.doc:
            raise StripError("Not a html document")

        html_body = Document(self.doc)
        summary = html_body.summary()
        title = html_body.short_title()
        images = []

        for img in html_body.reverse_tags(html_body.html, 'img'):
            try:
                fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
                r = requests.get(img.get('src'), stream=True)
                write_file(r, fp)
                images.append(fp)
            except Exception:
                pass

        html = "<h1>" + title + "</h1>" + summary

        regex = re.compile('\n*', flags=re.IGNORECASE)
        html = regex.sub('', html)
        return html, images
Example #50
def process(doc):
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            r = requests.get(img.get('src'), stream=True)
            write_file(r, fp)
            images.append(fp)
        except Exception:
            pass

    html = '<h1>' + title + '</h1>' + summary

    regex = re.compile('\n*', flags=re.IGNORECASE)
    html = '<p>{}</p>'.format(regex.sub('', html))

    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images
Example #51
File: extract.py Project: yujiye/Codes
def extractTitle(html):
    if html == "":
        return None
    try:
        doc = Document(html)
        short_title = doc.short_title()
        title = doc.title()
        if short_title is not None and short_title.strip() != "":
            title = short_title

        for delimiter in ['|', '-', '::', '/', '_']:
            if delimiter in title:
                parts = title.split(delimiter)
                if len(parts[0]) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1]) >= 4:
                    title = parts[-1]
                    break

        return title
    except:
        pass
    return None
Example #52
 def parse(self, response):
     if self.max_crawl < 1:
         return
     # response = scrapy.http.TextResponse(response)
     doc = Document(response.text)
     title = doc.short_title()
     content = pq(doc.summary()).text()
     if len(content) > 150:
         item = Scrapy1Item()
         item["content"] = content #.encode("utf-8")
         item["title"] = title #.encode("utf-8")
         item["url"] = response.url
         item["crawler_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         self.max_crawl -= 1
         yield item
     for url in response.xpath("//a/@href").extract():
         if "javascript" in url and "http" not in url:
             continue
         url=urllib.parse.urljoin(get_base_url(response), url)
         host = urlparse(url).netloc
         if host not in self.host_list:
             continue
         print("crawl {}".format(url))
         yield scrapy.Request(url, callback=self.parse)
Example #53
import yql
import urllib

# imports implied by the snippet below (assumed: pymongo's legacy Connection and readability's Document)
from pymongo import Connection
from readability.readability import Document

con = Connection()
db = con.learner
news = db.news
yqlpublic = yql.Public()
rsscfg = open("rss.cfg", "r")
for line in rsscfg.readlines():
    category = line.split(":")
    query = 'select link,pubDate from rss where url ="http://rss.news.yahoo.com/rss/%s"' % category[0]
    results = yqlpublic.execute(query)
    for result in results.rows:
        html = urllib.urlopen(result["link"]).read()
        doc = Document(html)
        news.insert(
            {
                "art_id": 0,
                "content": doc.summary(),
                "description": "",
                "hardness": 0,
                "pub_date": result["pubDate"],
                "ranking": 0,
                "rss_id": category[1],
                "time_index": 1,
                "tiny_image": "",
                "title": doc.short_title(),
                "url": result["link"],
            }
        )
def extract_with_python_readability(raw_content):
    doc = Document(raw_content)

    return [u'' + doc.short_title().strip(), u'' + doc.summary().strip()]
 def extract(self, body, url):
     doc = Document(body)
     title = doc.short_title()
     content = doc.summary()
     self.save_article(url, title, content)
     self.logger.info('Article extract, title=%s, content=%s' % (title, content[:100]))
Example #56
class Page(object):
    """
        Basic Page
    """
    def __init__(self, item, config):
        """
        Mapping item and config
        """
        for k, v in item.items():
            if k in ["url", "source_url", "depth", "type"]:
                setattr(self, k, v) 
        
        
        for k, v in config.items():
            if k in ['filter_lang','max_depth', "query", "directory", "filter", "short_export", "date"]:
                setattr(self, k, v) 
            
        ##logger.debug("Page Init")
        
        self.status = True
        self.load_default()
        
        
    
    
    def load_default(self):
        self.msg = ""
        self.code = 100
        self.status = True
        return self

        
    
    def process(self, filter_text=True):
        
        self.check_depth()
        
        self.valid_url()
        
        self.fetch()
        
        self.clean_article()
        
        self.extract()
        
        self.check_lang()
        
        if filter_text:
            if self.filter is not False:
                self.filter_text()
        return self.status
        
    @check_status
    #@debug
    def check_depth(self):
        '''checking depth'''
        if self.depth is False or self.depth is None:
            self.depth = 0
            
        #logger.debug("Page check depth")
        if self.depth > self.max_depth:
            self.code = "102"
            self.msg = "Depth exceed max_depth for page" %(self.max_depth)
            self.status = False
            return self.status
        else:
            return self
    
    @check_status
    #@debug
    def valid_url(self):
        '''checking url format and validity'''
        for k, v in self.parse_link(self.url).items():
            if k is None:
                continue
            
                
            if v is None or v == "":
                setattr(self, k, "")
            else:
                setattr(self, k, v)
                
        
        
        try:
            if self.scheme not in ACCEPTED_PROTOCOL:
                self.msg = 'URL: Wrong protocol %s' % self.scheme
                self.status = False
                self.code = 804
                return self.status
        except Exception as e:
            logging.warning("%s" %str(e))
            pass
        try:
            if self.filetype in BAD_TYPES:
                self.msg = 'URL: Invalid webpage type %s' % self.filetype
                self.status = False
                self.code = 806
                return self.status
        except Exception as e:
            pass
        try:
            if self.domain in BAD_DOMAINS:
                self.msg = 'URL: Bad domain %s' % self.domain
                self.status = False
                self.code = 807
                return self.status
        except Exception as e:
            logging.warning("%s" %str(e))
            pass
        try:
            if self.subdomain in BAD_DOMAINS:
                self.msg = 'URL: Bad subdomain %s' % self.subdomain
                self.status = False
                self.code = 807
                return self.status
        except Exception as e:
            logging.warning("%s" %str(e))
            pass
        try:
            if self.path in BAD_PATHS:
                self.msg = 'URL: Bad path %s' % self.path
                self.status = False
                self.code = 807
                return self.status
        except Exception as e:
            logging.warning("%s" %str(e))
            pass
            
        if filter.match(self.url):
            self.msg = 'URL: Blacklisted url'
            self.status = False
            self.code = 808
            return self.status

        return self
        
    
    @check_status
    def fetch(self):
        '''downloading page'''
        try:
            req = requests.get(self.url, allow_redirects=True, timeout=3)
            req.raise_for_status()
            try:
                self.html = req.text
                self.content_type = req.headers['content-type']
                if 'text/html' not in self.content_type:
                    self.msg ="Control: Content type is not TEXT/HTML"
                    self.code = 404
                    self.status = False
                    return self.status
            #Error on ressource or on server
                elif req.status_code in range(400,520):
                    self.code = int(req.status_code)
                    self.msg = "Control: Request error on connexion no ressources or not able to reach server"
                    self.status = False
                    return self.status
                else:
                    if self.html == "" or self.html is None:
                        self.msg = "Error loading HTML from request"
                        self.code = 405
                        self.status = False
                        return self.status
                    try:
                        self.html = self.html
                        self.tree = lxml.html.document_fromstring(self.html)
                        #cleaning with lxml it's fun!
                        self.tree = cleaner.clean_html(self.tree)
                        self.tree.make_links_absolute(self.url)
                        self.doc = lxml.html.tostring(self.tree)
                        self.doc = (self.doc).replace(unichr(160), " ")
                        self.doc = re.sub(re.compile("\r+|\n+|\t+|\s+")," ",self.doc)
                        if self.doc == "" or self.doc is None:
                            self.msg = "Error loading HTML from request"
                            self.code = 405
                            self.status = False
                            return self.status
                        else:
                            return self
                        
                    except Exception as e:
                        self.msg = "Error loading HTML: "+str(e)
                        self.code = 405
                        self.status = False
                        return self.status

            except Exception as e:
                self.msg = "Requests: answer was not understood %s" %e
                self.code = 400
                self.status = False
                return self.status
                
        except Exception as e:
            #logger.warning(e)
            self.msg = "Incorrect link url"
            try:
                self.code = req.status_code
                self.status = False
                return self.status
            except Exception as e:
                self.code = 400
                self.status = False
                return self.status
    @check_status
    def clean_article(self):
        
        try:
            self.clean_doc = Document(self.doc,url = self.url, positive_keywords= "entry-content,post,main,content,container,blog,article*,post,entry", negative_keywords="like*,ad*,comment.*,comments,comment-body,about,access,navigation, sidebar.*?,share.*?,relat.*?,widget.*?")
            self.article = self.clean_doc.summary()
            self.text = re.sub("  |\t", " ",bs(self.article).get_text())
            self.title = self.clean_doc.short_title()
            if self.text == "" or self.text == u'':
                self.msg = "Error extracting Article and cleaning it"
                self.code = 700
                self.status = False
                return self.status
            if self.title == '':
                self.title = u''
            return self
        except AttributeError as e:
            self.msg = "Error loading HTML: %s" %str(e)
            self.code = 400
            self.status = False
            return self.status
        
    @check_status
    #@debug
    def extract(self):
        '''extracting info from page'''
        if self.doc is not None:
            
            links = list(set([n.get('href') for n in bs(self.article).find_all("a")]))
            links = [n for n in links if n != self.url]
            #get links, cited_links, cited_links_ids, cited_domains
            self.outlinks = self.parse_outlinks(links)
            self.get_meta()
            return self
        else:
            #~ #self.msg = str(#logger.debug("ParserError"))
            self.msg = "Extract Error"
            self.code = 701
            self.status = False
            return self.status
    
    def parse_link(self, url):
        '''parsing link info'''
        link = {"url":url}
        
        parsed_url = urlparse(url)
        for k in ["scheme", "netloc", "path", "params", "query", "fragment"]:
            if k == "query":
                link["url_query"] = getattr(parsed_url,k)
            else:
                link[k] = getattr(parsed_url,k)
                
        tld_dat = tldextract.extract(url)
        for k in ["domain", "subdomain", "suffix"]:
            link[k] = getattr(tld_dat,k)
        #~ link["subdomain"] = tld_dat.subdomain
        #~ link["domain"] = tld_dat.domain.lower()
        if link["subdomain"] not in ["www", "ww1", "ww2", ""]:
            link["url_id"] = link["subdomain"]+"_"+link["domain"]
        else:
            link["url_id"] = link["domain"]
            
        link["extension"] =  link["suffix"]
        del link["suffix"]
        link["chunks"] = [x for x in link["path"].split('/') if len(x) > 0]
        link["internal_depth"] = len(link["chunks"])
        link["filetype"] = re.split(".", link['netloc'])[-1]                
        return link
        
    
    def parse_outlinks(self, links):
        '''creating outlinks from page'''
        self.links = [self.parse_link(url) for url in set(links) if url is not None and url != ""]
        
        self.cited_links = [n["url"] for n in self.links]
        self.cited_links_ids = [n["url_id"] for n in self.links]
        self.cited_domains = [n["domain"] for n in self.links]
        self.outlinks = [{"url": n["url"], "url_id":n["url_id"], "source_url": self.url, "depth": self.depth+1, "type":"page"} for n in self.links]
        return self.outlinks
        
    def get_meta(self):
        self.generators = []
        self.meta = {}
        for n in bs(self.doc).find_all("meta"): 
            name = n.get("name")
            prop = n.get("property")
            content = n.get("content")
            if name is not None and name not in ["type", "viewport"]:
                if name.lower() in ["generator"]:
                    self.generators.append(content)
                else:
                    self.meta[re.sub("og:|DC.", "", name)] = content
                #~ 
            if prop is not None:
                self.meta[re.sub("og:|DC.", "", prop)] = content
        try:
            self.keywords = self.meta["keywords"]
        except KeyError:
            self.keywords = [""]
        return self.meta 
    
    

    @check_status
    #@debug
    def check_lang(self):
        '''checking lang'''
        try:
            self.lang = detect(self.text)
        except Exception as e:
            logging.warning("No lang detected in article")
            try:
                self.lang = detect(self.title)
            except Exception as e:
                logging.warning("No lang detected in title")
                self.lang = None
                
        if self.filter_lang is not False:
            if self.lang == self.filter_lang:
                return self
            else:
                self.status = False
                return self.status

    
    @check_status
    #@debug
    def filter_text(self):
        '''filter_text: checking relevancy'''
        
        q = Query(self.query, self.directory)
        #print "Debug query", q.query
                
        doc = {"content": self.text, "title": self.title}
        
        relevant = q.match(doc)
        
        if relevant is False:
        
            self.code = 800
            self.msg = "Article Query Filter: text not relevant"
            self.status = False
            return self.status
        else:
            self.status = True
            return self
    
    def format_export(self):
        '''format export'''
        #for n in ["url_id","url", "cited_links", "cited_links_ids","source_url", "cited_domains", "title", "text", "keywords", "generators", "extension", "filetype", "depth", "crawl_nb", "status", "msg", "date", "code", "nb", "total"]:
        pass
    #@debug
    def set_data(self):
        '''Set data : creating default page info'''
        data = {}
        for n in ["date", "url", "url_id","url", "cited_links", "cited_links_ids","source_url", "cited_domains", "title", "text","html", "keywords", "generators", "extension", "filetype", "depth", "crawl_nb", "status", "msg", "date", "code", "lang"]:
            #unique info
            if n in ["url_id","url","extension", "filetype", "depth", "crawl_nb", "source_url", "type", "lang"]:
                if n in ["type"]:
                    if self.depth == 0:
                        data["type"] = "source"
                    else:
                        if self.status is True:
                            data["type"] = "page"
                        else:
                            data["type"] = "log"
                else:        
                    try:
                        # keep the original data type
                        data[n] = self.__dict__[n]
                    except KeyError:
                        if n in ["crawl_nb", "depth"]:
                            data[n] = 0
                        else:
                            data[n] = None
            #multiple info
            else:
                try:
                    data[n] = [self.__dict__[n]]
                    
                except KeyError:
                    data[n] = [None]
        #meta_data
        #~ for k, v in self.meta.items():
            #~ data["meta_"+k] = v
        return data
    
    #@debug
    def add_data(self):
        '''Add data : updating values of page_info adding contextual info to existing'''
        data = {}
        for n in ["cited_links", "cited_links_ids", "cited_domains", "title", "text","html", "keywords", "generators", "status", "code", "msg", "date"]:
            try:
                data[n] = {"$each":[self.__dict__[n]], "$position":0}
            
            except KeyError:
                data[n] = {"$each":[None], "$position":0}
            
        return data
                
    def get_status(self):
        data = {}
        for k in ["status", "date", "code", "msg"]:
            try:
                data[k] = self.__dict__[k]
            except KeyError:
                data[k] = None
        return data
Example #57
class Gist:

    keyword_pattern = re.compile(r"^[^\d]+$")
    stop_words = set(get_stop_words("en"))

    def __init__(self, html):
        self.html = html
        self.document = Document(html)

    @property
    def title(self):
        return self.document.short_title()

    @cached_property
    def text(self):
        text = self.document.summary()
        text = re.sub("<br[^>]+>", "\n", text)
        text = re.sub("</?p[^>]+>", "\n\n", text)
        text = re.sub("<[^>]+>", "", text)
        text = re.sub("^[ \t]+$", "", text)
        text = re.sub("\n{3,}", "\n\n", text, flags=re.MULTILINE)
        return text

    @staticmethod
    def _common_prefix(one, two):
        parallelity = [x == y for x, y in zip(one, two)] + [False]
        return parallelity.index(False)

    @classmethod
    def _find_representative(cls, stem, text):
        tokens = text.split()
        prefixes = {token: cls._common_prefix(token, stem) for token in tokens}
        best = lambda token: (-token[1], len(token[0]))
        return sorted(prefixes.items(), key=best)[0][0]

    @classmethod
    def _is_good_keyword(cls, word):
        return (word not in cls.stop_words) and cls.keyword_pattern.match(word)

    @classmethod
    def find_keywords(cls, text):
        whoosh_backend = SearchForm().searchqueryset.query.backend
        if not whoosh_backend.setup_complete:
            whoosh_backend.setup()
        with whoosh_backend.index.searcher() as searcher:
            keywords = searcher.key_terms_from_text("text",
                                                    text,
                                                    numterms=10,
                                                    normalize=False)
        keywords = list(zip(*keywords))[0] if keywords else []
        keywords = [
            cls._find_representative(keyword, text) for keyword in keywords
        ]
        keywords = [
            keyword for keyword in keywords if cls._is_good_keyword(keyword)
        ]
        # no double keywords in list
        keywords = list(set(keywords))
        # no punctuation in suggested keywords
        keywords = [
            "".join(c for c in s if c not in string.punctuation)
            for s in keywords
        ]
        return keywords

    @property
    def keywords(self):
        return self.find_keywords(self.text)
Example #58
# encoding:utf-8
# import html2text
import requests
import time
import re
from readability.readability import Document

url = "http://world.huanqiu.com/exclusive/2016-07/9209839.html"
# res = requests.get('http://finance.sina.com.cn/roll/2019-02-12/doc-ihrfqzka5034116.shtml')
res = requests.get(url)

st = time.time()
d = Document(res.content)

# get the news title
readable_title = d.short_title()
print(readable_title)
# get the content and clean it
readable_article = d.summary()
# print(readable_article)

print(d.get_clean_html())

print("time: {}".format(time.time() - st))

# text_p = re.sub(r'</?div.*?>', '', readable_article)
# text_p = re.sub(r'((</p>)?<a href=.*?>|</a>(<p>)?)', '', text_p)
# text_p = re.sub(r'<select>.*?</select>', '', text_p)
# print(text_p)
Example #59
File: read.py Project: uranther/kipy
class ReadHtml:
    """
    Extract main article content from a webpage

    @see http://github.com/buriy/python-readability
    """
    def __init__(self, url):
        self.url = url
        self.html = self.get_html()
        self.text = ReadHtml.strip_tags(self.html)

        try:
            self.doc = Document(self.html)
        except TypeError as e:
            log.error('{}: {}'.format(
                self.url,
                e
            ))

    # Special cases for URLs at these domains:
    #   news.ycombinator.com
    #   slashdot.org
    #   reddit.com
    #   - Attempt to download all linked pages

    # If the bookmark is at the root level (/)
    # mirror the entire site

    # Attempt to find the Hacker News comments thread

    # Special cases for images and PDFs
    #   - use ReadImg and ReadPdf instead

    def get_html(self):
        """
        Download the HTML of the URL
        """
        try:
            return self.html
        except AttributeError:
            self.request = urllib.request.urlopen(self.url)
            self.html_bytes = self.request.read()
            try:
                return self.html_bytes.decode('utf-8')
            except UnicodeDecodeError:
                return self.html_bytes.decode('iso-8859-1')

    def get_http_code(self):
        return int(self.request.getcode())

    def get_http_headers(self):
        return ReadHtml.convert_http_headers(
            self.request.info().items()
        )

    @staticmethod
    def convert_http_headers(items):
        headers = {}
        for header in items:
            headers[header[0]] = header[1]
        return headers

    def get_content(self):
        """
        Get the readable main content from a webpage
        """
        try:
            return self.doc.summary()
        except (TypeError, AttributeError, Unparseable) as e:
            log.error('{}: {}'.format(
                self.url,
                e
            ))
            return ''

    def get_title(self):
        """
        Get the "readable title" from a webpage
        """
        try:
            return self.doc.short_title()
        except (TypeError, AttributeError, Unparseable) as e:
            log.error('{}: {}'.format(
                self.url,
                e
            ))
            return ''

    def get_text(self):
        """
        Return the extracted text from the webpage
        """
        return self.text

    def get_links(self):
        """
        Return a list of URL strings captured from the article body
        """
        pass

    def is_article(self):
        """
        Can the webpage be read as an article?

        TODO: This can be more sophisticated, such as checking for
        <p> tags, setting a higher threshold for number of characters

        TODO: Find out if Readability will return this value in the 
        API, even though it misses a lot
        """
        if len(self.text) > 0:
            return True

        return False

    @staticmethod
    def strip_tags(html_string):
        """
        Strip tags and extract the text from the HTML string
        @see http://stackoverflow.com/a/925630
        """
        s = MLStripper()
        s.feed(str(html_string))
        return s.get_data()