Пример #1
0
    def list(self, event, number, full, plurality, name, start):
        """Reply with a numbered listing of entries from the feed ``name``.

        number: how many entries to show; converted with int() when given.
            When it is empty, 10 entries are shown if ``plurality`` is
            truthy and 1 otherwise.
        full: when equal to 'article', each entry is listed with its link
            and summary; otherwise only the titles are listed.
        start: offset of the first entry to show; converted with int()
            when given, 0 otherwise.

        Every entry is labelled with its 1-based position in
        ``feed.entries``, in both listing modes.  Responses are sent through
        ``event.addresponse``; nothing is returned.
        """
        full = full == 'article'
        if number:
            number = int(number)
        elif not plurality:
            number = 1
        else:
            number = 10
        start = int(start) if start else 0

        feed = event.session.query(Feed).filter_by(name=name).first()

        if not feed:
            event.addresponse(u"I don't know about the %s feed", name)
            return

        feed.update()
        if not feed.entries:
            event.addresponse(u"I can't find any articles in that feed")
            return

        # Slice the positions rather than the entries so that each label is
        # the entry's real position in feed.entries.  The full listing used
        # to number entries relative to the slice, which disagreed with the
        # title-only listing whenever start was non-zero, and index() both
        # rescanned the feed per entry and mislabelled equal entries.
        positions = range(len(feed.entries))[start:number + start]
        entries = []
        for position in positions:
            article = feed.entries[position]
            if full:
                if 'summary' in article:
                    summary = html2text_file(article.summary, None)
                else:
                    content = article.content[0]
                    if content.type in ('application/xhtml+xml', 'text/html'):
                        summary = html2text_file(content.value, None)
                    else:
                        # Non-HTML content is passed through unconverted.
                        summary = content.value

                entries.append(
                    u'%(number)s: "%(title)s"%(link)s : %(summary)s' % {
                        'number': position + 1,
                        'title': html2text_file(article.title, None).strip(),
                        'link': get_link(article),
                        'summary': summary,
                    })
            else:
                entries.append(u'%s: "%s"' %
                               (position + 1,
                                html2text_file(article.title, None).strip()))
        event.addresponse(u', '.join(entries))
Пример #2
0
def html2fmt(html, target_format):
    """Return ``html`` rendered for ``target_format``.

    The markup is handed back untouched when ``target_format`` is 'html';
    for every other format it is converted with ``html2text_file``.
    """
    wants_raw_html = target_format == 'html'
    if not wants_raw_html:
        return html2text_file(html, None)
    return html
Пример #3
0
def html2fmt(html, target_format):
    """Convert ``html`` to the requested output format.

    'html' means no conversion is needed, so the input is returned as is.
    Any other value is treated as a request for text and the markup is
    passed through ``html2text_file``.
    """
    if target_format != 'html':
        return html2text_file(html, None)
    return html
Пример #4
0
    def article(self, event, number, pattern, name):
        """Reply with one article from the feed called ``name``.

        When ``number`` is given it is the 1-based position of the article
        in the feed.  Otherwise the first entry whose title matches the
        regular expression ``pattern`` (case-insensitively) is used.  The
        reply carries the title, the entry's link and a text summary; all
        replies go through ``event.addresponse``.
        """
        feed = event.session.query(Feed).filter_by(name=name).first()
        if not feed:
            event.addresponse(u"I don't know about the %s feed", name)
            return

        feed.update()
        if not feed.entries:
            event.addresponse(u"I can't access that feed")
            return

        if number:
            position = int(number)
            if position < 1 or position > len(feed.entries):
                event.addresponse(u"That's old news dude")
                return
            article = feed.entries[position - 1]
        else:
            matcher = re.compile(pattern, re.I)
            for article in feed.entries:
                if matcher.search(article.title):
                    break
            else:
                # Loop ran to completion: no title matched.
                article = None

            if not article:
                event.addresponse(u'Are you making up news again?')
                return

        # Prefer the entry's summary; fall back to its first content item,
        # converting to text only when that item is HTML.
        if 'summary' in article:
            summary = html2text_file(article.summary, None)
        else:
            body = article.content[0]
            if body.type in ('application/xhtml+xml', 'text/html'):
                summary = html2text_file(body.value, None)
            else:
                summary = body.value

        reply_fields = {
            'title': html2text_file(article.title, None).strip(),
            'link': article.link,
            'summary': summary,
        }
        event.addresponse(u'"%(title)s" %(link)s : %(summary)s', reply_fields)
Пример #5
0
def html2fmt(html, target_format):
    """Return ``html`` in ``target_format``.

    'html' returns the input unchanged; anything else is converted to text
    by calling ``html2text_file`` in-process.
    """
    if target_format == 'html':
        return html
    # The conversion is done with a direct call; the original author noted
    # having had unicode encoding trouble with process.POpen.
    converted = html2text_file(html, None)
    return converted
Пример #6
0
def html2fmt(html, target_format):
    """Produce ``html`` in the format named by ``target_format``.

    Only 'html' is passed through verbatim.  Every other format name
    results in the markup being converted by ``html2text_file``.
    """
    keep_markup = (target_format == 'html')
    # Direct in-process call: the original author reported unicode encoding
    # trouble with process.POpen.
    return html if keep_markup else html2text_file(html, None)
Пример #7
0
    def article(self, event, number, pattern, name):
        """Reply with a single article picked from the feed ``name``.

        ``number`` (when given) selects the article by 1-based position;
        otherwise ``pattern`` is compiled as a case-insensitive regular
        expression and the first entry whose title matches is chosen.  The
        reply contains the title, the result of ``get_link`` for the entry
        and a text summary, and is sent with ``event.addresponse``.
        """
        feed = event.session.query(Feed).filter_by(name=name).first()
        if not feed:
            event.addresponse(u"I don't know about the %s feed", name)
            return

        feed.update()
        if not feed.entries:
            event.addresponse(u"I can't find any articles in that feed")
            return

        if number:
            wanted = int(number)
            if wanted > len(feed.entries) or wanted < 1:
                event.addresponse(u"That's old news dude")
                return
            article = feed.entries[wanted - 1]
        else:
            title_re = re.compile(pattern, re.I)
            article = None
            for candidate in feed.entries:
                if title_re.search(candidate.title):
                    article = candidate
                    break

            if not article:
                event.addresponse(u'Are you making up news again?')
                return

        # Use the summary when the entry has one, otherwise its first
        # content item (converted to text only if it is HTML/XHTML).
        if 'summary' in article:
            summary = html2text_file(article.summary, None)
        else:
            first_content = article.content[0]
            html_types = ('application/xhtml+xml', 'text/html')
            if first_content.type in html_types:
                summary = html2text_file(first_content.value, None)
            else:
                summary = first_content.value

        title = html2text_file(article.title, None).strip()
        event.addresponse(u'"%(title)s"%(link)s : %(summary)s', {
            'title': title,
            'link': get_link(article),
            'summary': summary,
        })
Пример #8
0
    def list(self, event, number, full, plurality, name, start):
        """Reply with a numbered listing of entries from the feed ``name``.

        number: how many entries to show; converted with int() when given.
            When it is empty, 10 entries are shown if ``plurality`` is
            truthy and 1 otherwise.
        full: when equal to 'article', each entry is listed with its link
            and summary; otherwise only the titles are listed.
        start: offset of the first entry to show; converted with int()
            when given, 0 otherwise.

        Entries carry their 1-based position in ``feed.entries`` as label in
        both listing modes.  Responses are sent through
        ``event.addresponse``; nothing is returned.
        """
        full = full == 'article'
        if number:
            number = int(number)
        elif not plurality:
            number = 1
        else:
            number = 10
        start = int(start) if start else 0

        feed = event.session.query(Feed).filter_by(name=name).first()

        if not feed:
            event.addresponse(u"I don't know about the %s feed", name)
            return

        feed.update()
        if not feed.entries:
            event.addresponse(u"I can't find any articles in that feed")
            return

        # Walk the selected positions instead of calling index() per entry:
        # the full listing previously labelled entries relative to the slice
        # (wrong whenever start was non-zero) and index() returns the first
        # equal entry, which mislabels duplicates.
        entries = []
        for position in range(len(feed.entries))[start:number + start]:
            article = feed.entries[position]
            title = html2text_file(article.title, None).strip()
            if not full:
                entries.append(u'%s: "%s"' % (position + 1, title))
                continue

            if 'summary' in article:
                summary = html2text_file(article.summary, None)
            else:
                content = article.content[0]
                if content.type in ('application/xhtml+xml', 'text/html'):
                    summary = html2text_file(content.value, None)
                else:
                    # Non-HTML content is used as it is.
                    summary = content.value

            entries.append(u'%(number)s: "%(title)s"%(link)s : %(summary)s' % {
                'number': position + 1,
                'title': title,
                'link': get_link(article),
                'summary': summary,
            })
        event.addresponse(u', '.join(entries))
Пример #9
0
    def list(self, event, number, name, start):
        """Reply with the numbered titles of entries in the feed ``name``.

        ``number`` entries are shown (10 when ``number`` is empty or
        converts to zero), beginning at offset ``start`` (0 when empty).
        Each title is labelled with the 1-based position that
        ``feed.entries.index`` reports for the entry.
        """
        count = 10
        if number and int(number):
            count = int(number)
        offset = 0
        if start:
            offset = int(start)

        feed = event.session.query(Feed).filter_by(name=name).first()
        if not feed:
            event.addresponse(u"I don't know about the %s feed", name)
            return

        feed.update()
        if not feed.entries:
            event.addresponse(u"I can't find any articles in that feed")
            return

        titles = []
        for entry in feed.entries[offset:count + offset]:
            label = feed.entries.index(entry) + 1
            title = html2text_file(entry.title, None).strip()
            titles.append(u'%s: "%s"' % (label, title))
        event.addresponse(u', '.join(titles))
Пример #10
0
 def do_VIEW_DETAILS_OR_NOT(self,command_name,details,*args) :
     """Print every selected entry: banner, title and converted content.

     The positions come from ``self.get_selection_list(args)``.  When
     ``details`` is truthy the entry's link (if any) is printed after the
     title and its categories (if any) after the content, each framed by
     dashed rules.  ``command_name`` is not used by this method.
     """
     thin_rule = '-'*65+'\n'
     for position in self.get_selection_list(args) :
         reader = self._rss_reader
         self._print('\n')
         self._print('=== [ %3d ] %s\n' % (position+1, '='*58))
         self._print(self.format_title(reader.get_title(position))+'\n')
         self._print(thin_rule)
         if details :
             link = reader.get_link(position)
             if link :
                 self._print(link+'\n')
                 self._print(thin_rule)
         self._print(html2text_file(reader.get_content(position),None))
         if not details :
             continue
         categories = reader.get_categories(position)
         if categories is None :
             continue
         # The rules around the category list are only drawn when there is
         # at least one category to show.
         framed = len(categories)>0
         if framed :
             self._print(thin_rule)
         for categorie in categories :
             self._print('  %s\n' % categorie)
         if framed :
             self._print(thin_rule)
Пример #11
0
    def list(self, event, number, name, start):
        """List entry titles of the feed ``name`` in a single response.

        ``number`` limits how many titles are shown and falls back to 10
        when it is empty or converts to zero; ``start`` is the offset of
        the first title and falls back to 0.  Titles are prefixed with the
        entry's 1-based position as found by ``feed.entries.index``.
        """
        limit = (int(number) if number else 0) or 10
        first = int(start) if start else 0

        feed = event.session.query(Feed).filter_by(name=name).first()
        if not feed:
            event.addresponse(u"I don't know about the %s feed", name)
            return

        feed.update()
        if not feed.entries:
            event.addresponse(u"I can't find any articles in that feed")
            return

        selected = feed.entries[first:limit + first]
        lines = []
        for entry in selected:
            position = feed.entries.index(entry) + 1
            plain_title = html2text_file(entry.title, None).strip()
            lines.append(u'%s: "%s"' % (position, plain_title))
        event.addresponse(u', '.join(lines))
Пример #12
0
def html2text(html, baseurl=''):
    """Convert ``html`` with ``html2text_file`` and pass the result to ``optwrap``.

    ``baseurl`` is forwarded to ``html2text_file`` as its third argument.
    """
    converted = html2text_file(html, None, baseurl)
    return optwrap(converted)
Пример #13
0
def handleMsg(mailbox, msg, is_subpart=False, strdate=""):
  """ This function handles a message object recursively, it has 
      several tasks:
      - save all of the attachments in the message
      - extract all of the text information into the message body
      - if the email contains html messages they will be converted 
        into text and added to the message body
      - extract all of the field information (To, Cc, From, ...)
        from the message objects
      
  """
  global text
  global attachments
  global fieldFrom, fieldSubject, fieldTime

  # Message/RFC822 parts are bundled this way ==============
  while isinstance(msg.get_payload(),email.Message.Message):
    msg=msg.get_payload()

  if not is_subpart:
    fieldFrom = ""
    fieldSubject = ""
    fieldTime = None    # fieldTime is a 9-item tuple
    text = ""           # the text contents of a message
    attachments = ""

  ## Set the "From" Field ==================================
  if fieldFrom == "" and msg['From'] != None:
    text += "To: %s\n" % decode_field(msg['To'])
    if msg['Cc'] != None:
      text += "Cc: %s\n" % decode_field(msg['Cc'])
    if msg['Bcc'] != None:
      text += "Bcc: %s\n" % decode_field(msg['Bcc'])
    text += "From: %s\n" % decode_field(msg['From'])
    fieldFrom = decode_field(msg['From'])

  ## Set the "Subject" Field ===============================
  if fieldSubject == "" and msg['Subject'] != None:
    fieldSubject = decode_field(msg['Subject'])
    text += "Subject: %s\n" % fieldSubject

  ## Set the "Date" Field ==================================
  if fieldTime == None and msg['Date'] != None:
    fieldTime = string2time(msg['Date'])
    strdate = time.strftime("%Y%m%d%H%M", fieldTime)

  ## Handle multipart messages recursively =================
  if msg.is_multipart():
    for submsg in msg.get_payload():
      handleMsg(mailbox, submsg, True, strdate)
  else:
    fname = msg.get_filename()
    if fname == None:
      if msg.get_content_type() == 'text/plain':
        text += "\n%s" % msg.get_payload(decode=1)
      else:
        fname = "message.htm"

    ## Save an attachment to a file ========================
    if not fname == None:
      fname = decode_field(fname)
      filename = "%s\\att_%s\\%s_%s" % (mailboxdir, mailbox, strdate, fname)
      org_filename = filename
      i = 1
      while os.path.exists(filename):
        path, ext = os.path.splitext(org_filename)
        filename = "%s (%d)%s" % (path, i, ext)
        i = i + 1

      print " Found part: %s" % filename  # for debugging purposes
      attachments += "%s\n" % filename
      fd = open (filename, "wb")
      data = msg.get_payload(decode=1)
      fd.write(data)

      # convert an html message to text
      if fname == "message.htm":
        try:
          strio = cStringIO.StringIO()
          html2text.html2text_file(data, out=strio.write)
          text += strio.getvalue()
          strio.close()
        except sgmllib.SGMLParseError, e:
          print e

      fd.close()
Пример #14
0
def get_desc(soup,url):
        """Return a text description taken from ``soup`` or, failing that, ``url``.

        A truthy ``soup`` is stringified and converted with
        ``ht.html2text_file``; otherwise the result of
        ``html2content.get_text(url)`` is returned as unicode.
        """
        if not soup:
            return unicode(html2content.get_text(url))
        return ht.html2text_file(str(soup),None)
Пример #15
0
def get_desc(soup, url):
    """Build a text description from ``soup`` when available, else from ``url``.

    With a truthy ``soup`` its string form is run through
    ``ht.html2text_file``.  Without one, ``html2content.get_text(url)`` is
    called and its result converted to unicode.
    """
    if soup:
        markup = str(soup)
        return ht.html2text_file(markup, None)
    fetched = html2content.get_text(url)
    return unicode(fetched)
Пример #16
0
def write_hakyll(data, target_format):

    sys.stdout.write("writing")
    item_uids = {}
    attachments = {}

    def get_blog_path(data, path_infix='hakyll'):
        name = data['header']['link']
        name = re.sub('^https?', '', name)
        name = re.sub('[^A-Za-z0-9_.-]', '', name)
        return os.path.normpath(build_dir + '/' + path_infix + '/' + name)

    blog_dir = get_blog_path(data)

    def get_full_dir(dir):
        full_dir = os.path.normpath(blog_dir + '/' + dir)
        if (not os.path.exists(full_dir)):
            os.makedirs(full_dir)
        return full_dir

    def open_file(file):
        f = codecs.open(file, 'w', encoding='utf-8')
        return f

    def get_item_uid(item, date_prefix=False, namespace=''):
        result = None
        if namespace not in item_uids:
            item_uids[namespace] = {}

        if item['wp_id'] in item_uids[namespace]:
            result = item_uids[namespace][item['wp_id']]
        else:
            uid = []
            if (date_prefix):
                dt = datetime.strptime(item['date'], date_fmt)
                uid.append(dt.strftime('%Y-%m-%d'))
                uid.append('-')
            s_title = item['slug']
            if s_title is None or s_title == '':
                s_title = item['title']
            if s_title is None or s_title == '':
                s_title = 'untitled'
            s_title = s_title.replace(' ', '_')
            s_title = s_title.strip(' \t\n\r\'')
            s_title = re.sub('[^a-zA-Z0-9_-]', '', s_title)
            uid.append(s_title)
            fn = ''.join(uid)
            n = 1
            while fn in item_uids[namespace]:
                n = n + 1
                fn = ''.join(uid) + '_' + str(n)
                item_uids[namespace][i['wp_id']] = fn
            result = fn
        return result

    def get_item_path(item, dir=''):
        full_dir = get_full_dir(dir)
        filename_parts = [full_dir, '/']
        if build_mode == 'tree':
            m = re.search('(\d+-\d+-\d+)(-)(.+)', item['uid'])
            if m is not None:
              uiddt = datetime.strptime(m.group(1),'%Y-%m-%d').strftime('%Y/%m/%d')
              filename_parts.append(uiddt)
              if (not os.path.exists(''.join(filename_parts))):
                      os.makedirs(''.join(filename_parts))
              filename_parts.append(os.path.join('/', m.group(3)))
            else:
              filename_parts.append(item['uid'])
        else:
            filename_parts.append(item['uid'])
        if item['type'] == 'page':
            if (not os.path.exists(''.join(filename_parts))):
                    os.makedirs(''.join(filename_parts))
            filename_parts.append('/index')
        filename_parts.append('.')
        filename_parts.append(target_format)
        return ''.join(filename_parts)

    def get_attachment_path(src, dir, dir_prefix='a'):
        try:
            files = attachments[dir]
        except KeyError:
            attachments[dir] = files = {}

        try:
            filename = files[src]
        except KeyError:
            file_root, file_ext = os.path.splitext(os.path.basename(urlparse(src)[2]))
            file_infix = 1
            if file_root == '':
                file_root = '1'
            current_files = files.values()
            maybe_filename = file_root + file_ext
            while maybe_filename in current_files:
                maybe_filename = file_root + '-' + str(file_infix) + file_ext
                file_infix = file_infix + 1
            files[src] = filename = maybe_filename

        target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + dir)
        target_file = os.path.normpath(target_dir + '/' + filename)

        if (not os.path.exists(target_dir)):
            os.makedirs(target_dir)

        #if src not in attachments[dir]:
        ##print target_name
        return target_file

    for i in data['items']:
        skip_item = False

        for field, value in item_field_filter.iteritems():
            if(i[field] == value):
                skip_item = True
                break

        if(skip_item):
            continue

        sys.stdout.write(".")
        sys.stdout.flush()
        out = None

        i['title'] = i['title'].strip(' \t\n\r\'')
        yaml_header = {
            'title': i['title'],
            'date': datetime.strptime(i['date'], '%Y-%m-%d %H:%M:%S'),
            'slug': i['slug'],
            'wordpressid': int(i['wp_id']),
            'comments': i['comments'],
        }
        if i['status'] != u'publish':
            yaml_header['published'] = False

        if i['type'] == 'post':
            i['uid'] = get_item_uid(i, date_prefix=True)
            fn = get_item_path(i, dir='_posts')
            out = open_file(fn)
            yaml_header['layout'] = 'post'
        elif i['type'] == 'page':
            i['uid'] = get_item_uid(i)
            # Chase down parent path, if any
            parentpath = ''
            item = i
            while item['parent'] != "0":
                item = next((parent for parent in data['items'] if parent['wp_id'] == item['parent']), None)
                if item:
                    parentpath = get_item_uid(item) + "/" + parentpath
                else:
                    break
            fn = get_item_path(i, parentpath)
            out = open_file(fn)
            yaml_header['layout'] = 'page'
        elif i['type'] in item_type_filter:
            pass
        else:
            print "Unknown item type :: " + i['type']

        if download_images:
            for img in i['img_srcs']:
                try:
                    urlretrieve(urljoin(data['header']['link'],
                                        img.decode('utf-8')),
                                get_attachment_path(img, i['uid']))
                except:
                    print "\n unable to download " + urljoin(data['header']['link'], img.decode('utf-8'))

        if out is not None:
            def toyaml(data):
                return yaml.safe_dump(data, allow_unicode=True, default_flow_style=False).decode('utf-8')

            tax_out = {}
            for taxonomy in i['taxanomies']:
                tvalue_list = []
                for tvalue in i['taxanomies'][taxonomy]:
                    t_name = taxonomy_name_mapping.get(taxonomy, taxonomy)
                    if t_name not in tax_out:
                        tax_out[t_name] = []
                    if tvalue in tax_out[t_name]:
                        continue
                    tvalue_list.append(tvalue)
                tax_out[t_name] = ",".join(tvalue_list)

            out.write('---\n')
            if len(yaml_header) > 0:
                out.write(toyaml(yaml_header))
            if len(tax_out) > 0:
                out.write(toyaml(tax_out))

            out.write('---\n\n')
            try:
                out.write(html2text_file(i['body'], None))
            except:
                print "\n Parse error on: " + title

            out.close()
    print "\n"
Пример #17
0
 def format_title(self,title) :
     """Return ``title`` converted by html2text_file as a single line.

     Leading and trailing carriage returns, newlines and spaces are
     stripped, and each remaining newline is replaced by a space.
     """
     converted = html2text_file(title,None)
     trimmed = converted.strip('\r\n ')
     return trimmed.replace('\n',' ')
Пример #18
0
def _html2text(html):
    """Return the text that ``html2text.html2text_file`` writes for ``html``.

    The converter is given the ``write`` method of an in-memory buffer as
    its output callable; the buffer's contents are returned once it has
    been closed.
    """
    buf = StringIO()
    emit = buf.write
    html2text.html2text_file(html, emit)
    converted = buf.getvalue()
    buf.close()
    return converted
Пример #19
0
def html2text(html, baseurl=''):
    """Return ``optwrap`` applied to the ``html2text_file`` conversion of ``html``.

    ``baseurl`` (empty by default) is handed to ``html2text_file`` unchanged.
    """
    raw_text = html2text_file(html, None, baseurl)
    wrapped = optwrap(raw_text)
    return wrapped