Пример #1
0
 def __init__(self, id, title, url, author, summary, published, content):
     """Populate the article from raw feed fields.

     The title falls back to ``_('Unknown')`` when it is empty or not a
     ``string_or_bytes`` instance. It is then converted with
     ``force_unicode``, cleaned with ``clean_xml_chars``, stripped, has
     ``&...;`` references substituted through ``entity_to_unicode`` and is
     finally passed through ``clean_ascii_chars``.

     ``author`` and ``summary`` are decoded as UTF-8 (with replacement of
     undecodable bytes) when they are truthy and not already ``str``.
     ``self.summary`` keeps the cleaned summary, while ``self.text_summary``
     holds a text-only rendering when the summary contains ``<``.

     ``published`` is stored as ``self.date`` and handed to ``dt_factory``
     (``assume_utc=True, as_utc=True``) to produce ``self.utctime``, which
     is also converted to ``local_tz`` for ``self.localtime``.
     """
     from lxml import html
     self.downloaded = False
     self.id = id
     if not title or not isinstance(title, string_or_bytes):
         title = _('Unknown')
     title = force_unicode(title, 'utf-8')
     self._title = clean_xml_chars(title).strip()
     try:
         self._title = re.sub(r'&(\S+?);',
             entity_to_unicode, self._title)
     except Exception:
         # Best effort: keep the title with its entities unresolved rather
         # than fail construction. Only ordinary errors are swallowed, so
         # KeyboardInterrupt/SystemExit still propagate.
         pass
     self._title = clean_ascii_chars(self._title)
     self.url = url
     self.toc_thumbnail = None
     self.internal_toc_entries = ()
     if author and not isinstance(author, str):
         author = author.decode('utf-8', 'replace')
     # Assigned once, after decoding, so the attribute never holds raw bytes.
     self.author = author
     if summary and not isinstance(summary, str):
         summary = summary.decode('utf-8', 'replace')
     summary = clean_xml_chars(summary) if summary else summary
     # self.summary keeps any markup; the text-only form is derived below.
     self.summary = summary
     if summary and '<' in summary:
         try:
             s = html.fragment_fromstring(summary, create_parent=True)
             summary = html.tostring(s, method='text', encoding='unicode')
         except Exception:
             # An unparseable summary is reported and dropped, not fatal.
             print('Failed to process article summary, deleting:')
             print(summary.encode('utf-8'))
             traceback.print_exc()
             summary = ''
     self.text_summary = clean_ascii_chars(summary)
     self.content = content
     self.date = published
     self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
     self.localtime = self.utctime.astimezone(local_tz)
     # Starts unset; nothing in this method fills it in.
     self._formatted_date = None
Пример #2
0
 def __init__(self, id, title, url, author, summary, published, content):
     """Populate the article from raw feed fields.

     The title falls back to ``_('Unknown')`` when it is empty or not a
     ``string_or_bytes`` instance. It is then converted with
     ``force_unicode``, cleaned with ``clean_xml_chars``, stripped, has
     ``&...;`` references substituted through ``entity_to_unicode`` and is
     finally passed through ``clean_ascii_chars``.

     ``author`` and ``summary`` are decoded as UTF-8 (with replacement of
     undecodable bytes) when they are truthy and not already
     ``unicode_type``. ``self.summary`` keeps the cleaned summary, while
     ``self.text_summary`` holds a text-only rendering when the summary
     contains ``<``.

     ``published`` is stored as ``self.date`` and handed to ``dt_factory``
     (``assume_utc=True, as_utc=True``) to produce ``self.utctime``, which
     is also converted to ``local_tz`` for ``self.localtime``.
     """
     from lxml import html
     self.downloaded = False
     self.id = id
     if not title or not isinstance(title, string_or_bytes):
         title = _('Unknown')
     title = force_unicode(title, 'utf-8')
     self._title = clean_xml_chars(title).strip()
     try:
         self._title = re.sub(r'&(\S+?);',
             entity_to_unicode, self._title)
     except Exception:
         # Best effort: keep the title with its entities unresolved rather
         # than fail construction. Only ordinary errors are swallowed, so
         # KeyboardInterrupt/SystemExit still propagate.
         pass
     self._title = clean_ascii_chars(self._title)
     self.url = url
     self.toc_thumbnail = None
     if author and not isinstance(author, unicode_type):
         author = author.decode('utf-8', 'replace')
     # Assigned once, after decoding, so the attribute never holds raw bytes.
     self.author = author
     if summary and not isinstance(summary, unicode_type):
         summary = summary.decode('utf-8', 'replace')
     summary = clean_xml_chars(summary) if summary else summary
     # self.summary keeps any markup; the text-only form is derived below.
     self.summary = summary
     if summary and '<' in summary:
         try:
             s = html.fragment_fromstring(summary, create_parent=True)
             summary = html.tostring(s, method='text', encoding=unicode_type)
         except Exception:
             # An unparseable summary is reported and dropped, not fatal.
             print('Failed to process article summary, deleting:')
             print(summary.encode('utf-8'))
             traceback.print_exc()
             summary = u''
     self.text_summary = clean_ascii_chars(summary)
     self.content = content
     self.date = published
     self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
     self.localtime = self.utctime.astimezone(local_tz)
     # Starts unset; nothing in this method fills it in.
     self._formatted_date = None