示例#1
0
 def get_context_title(self):
     '''Return a display title built from this object's title suggestions.

     The 'content-disposition' suggestion is used when present,
     otherwise the 'filename' suggestion.  A trailing '.pdf' extension
     (in any capitalisation) is removed, and the result is converted
     with unicode() and passed through tr_nan().

     Raises KeyError when neither key is available (assuming
     self.titlesuggestions behaves like a plain dict).
     '''
     suggestions = self.titlesuggestions
     # Membership test instead of the deprecated dict.has_key().
     if 'content-disposition' in suggestions:
         title = suggestions['content-disposition']
     else:
         title = suggestions['filename']
     # Compare lower-cased so every capitalisation of the extension
     # ('.pdf', '.PDF', '.pDf', ...) is stripped.
     if title[-4:].lower() == '.pdf':
         title = title[:-4]
     return tr_nan(unicode(title))
示例#2
0
 def marshal_link_metadata(self, links, linktype, baseurl, listofallfileextensions):
     '''Marshal the metadata glean-able from each link's context
     into a list of dictionaries, one per link, for the edification
     of other bits of this program.

     Arguments:
       links -- iterable of link elements; each is indexed with
           ['href'] and queried via findPrevious() and .text
           (presumably BeautifulSoup tags -- confirm against caller).
       linktype -- stored unchanged under 'InferredFileType'.
       baseurl -- base URL against which each href is resolved.
       listofallfileextensions -- extensions, in the form returned
           by os.path.splitext() (leading dot included), that are
           stripped from the link text for the suggested title.

     Returns a list of dicts with the keys 'Url', 'LastHeading',
     'LinkText', 'SuggestedTitle', 'PageTitle', 'Tags' and
     'InferredFileType'.  Links whose URL cannot be built are skipped.
     '''
     ret = []
     for link in links:
         # Best effort: a link without a usable href is skipped rather
         # than aborting the whole page.  Catch Exception instead of
         # using a bare except so that KeyboardInterrupt and SystemExit
         # still propagate.
         try:
             url = urlparse.urljoin(
                 baseurl, link['href'].encode('utf-8'))
         except Exception:
             continue
         # Decoding stays outside the try: a decode failure is not
         # silently dropped.
         linkdata = {'Url': url.decode('utf-8')}

         # Look the preceding heading up once and reuse the result.
         heading = link.findPrevious(head_re)
         if heading:
             linkdata['LastHeading'] = unicode(heading.text)
         else:
             linkdata['LastHeading'] = u''

         if hasattr(link, 'text'):
             text = link.text
             linkdata['LinkText'] = decode_htmlentities(unicode(text))
             st = text.strip(' -_')
             # Drop a known file extension from the visible link text.
             stem, ext = os.path.splitext(st)
             if ext in listofallfileextensions:
                 st = stem
             st = tr_nan(st)
             linkdata['SuggestedTitle'] = decode_htmlentities(unicode(st))
         else:
             # No link text: fall back to the URL itself.
             linkdata['LinkText'] = unicode(linkdata['Url'])
             linkdata['SuggestedTitle'] = unicode(linkdata['Url'])

         linkdata['PageTitle'] = self.get_context_title()
         # tag_by_data receives the dict as filled in so far, i.e.
         # before 'InferredFileType' is added.
         linkdata['Tags'] = tag_by_data(linkdata)
         linkdata['InferredFileType'] = linktype
         ret.append(linkdata)
     return ret
示例#3
0
 def marshal_link_metadata(self, links, linktype, baseurl,
                           listofallfileextensions):
     '''Marshal the metadata glean-able from each link's context
     into a list of dictionaries, one per link, for the edification
     of other bits of this program.

     Arguments:
       links -- iterable of link elements; each is indexed with
           ['href'] and queried via findPrevious() and .text
           (presumably BeautifulSoup tags -- confirm against caller).
       linktype -- stored unchanged under 'InferredFileType'.
       baseurl -- base URL against which each href is resolved.
       listofallfileextensions -- extensions, in the form returned
           by os.path.splitext() (leading dot included), that are
           stripped from the link text for the suggested title.

     Returns a list of dicts with the keys 'Url', 'LastHeading',
     'LinkText', 'SuggestedTitle', 'PageTitle', 'Tags' and
     'InferredFileType'.  Links whose URL cannot be built are skipped.
     '''
     ret = []
     for link in links:
         # Best effort: a link without a usable href is skipped rather
         # than aborting the whole page.  Catch Exception instead of
         # using a bare except so that KeyboardInterrupt and SystemExit
         # still propagate.
         try:
             url = urlparse.urljoin(
                 baseurl, link['href'].encode('utf-8'))
         except Exception:
             continue
         # Decoding stays outside the try: a decode failure is not
         # silently dropped.
         linkdata = {'Url': url.decode('utf-8')}

         # Look the preceding heading up once and reuse the result.
         heading = link.findPrevious(head_re)
         if heading:
             linkdata['LastHeading'] = unicode(heading.text)
         else:
             linkdata['LastHeading'] = u''

         if hasattr(link, 'text'):
             text = link.text
             linkdata['LinkText'] = decode_htmlentities(unicode(text))
             st = text.strip(' -_')
             # Drop a known file extension from the visible link text.
             stem, ext = os.path.splitext(st)
             if ext in listofallfileextensions:
                 st = stem
             st = tr_nan(st)
             linkdata['SuggestedTitle'] = decode_htmlentities(unicode(st))
         else:
             # No link text: fall back to the URL itself.
             linkdata['LinkText'] = unicode(linkdata['Url'])
             linkdata['SuggestedTitle'] = unicode(linkdata['Url'])

         linkdata['PageTitle'] = self.get_context_title()
         # tag_by_data receives the dict as filled in so far, i.e.
         # before 'InferredFileType' is added.
         linkdata['Tags'] = tag_by_data(linkdata)
         linkdata['InferredFileType'] = linktype
         ret.append(linkdata)
     return ret