def get_context_title(self):
    '''Return a cleaned-up title suggestion for the current document.

    The 'content-disposition' entry of ``self.titlesuggestions`` is
    preferred; when it is absent the 'filename' entry is used instead.
    A trailing ``.pdf`` extension (in any letter case) is removed, then the
    value is converted with ``unicode`` and passed through ``tr_nan``.

    Returns:
        Whatever ``tr_nan`` returns for the unicode title.

    Raises:
        KeyError: propagated from the 'filename' lookup when
            'content-disposition' is absent and 'filename' is missing too
            (assuming ``self.titlesuggestions`` behaves like a dict).
    '''
    suggestions = self.titlesuggestions
    # The ``in`` operator replaces the deprecated dict.has_key().
    if 'content-disposition' in suggestions:
        title = suggestions['content-disposition']
    else:
        title = suggestions['filename']
    # Compare case-insensitively so mixed-case spellings such as '.PdF'
    # are stripped as well, not only '.pdf', '.Pdf' and '.PDF'.
    if title[-4:].lower() == '.pdf':
        title = title[:-4]
    return tr_nan(unicode(title))
def marshal_link_metadata(self, links, linktype, baseurl,
                          listofallfileextensions):
    '''Marshal the metadata glean-able from each link's context together
    into a dictionary for the edification of other bits of this program.

    Args:
        links: iterable of link elements. Each is indexed with ``['href']``
            and queried with ``findPrevious`` and ``text`` (presumably
            BeautifulSoup tags -- confirm against the caller).
        linktype: value stored unchanged under 'InferredFileType'.
        baseurl: base URL each href is resolved against with
            ``urlparse.urljoin``.
        listofallfileextensions: container of extensions in the form
            ``os.path.splitext`` yields (leading dot included); a matching
            extension is dropped from the link text when building the
            suggested title.

    Returns:
        A list with one dict per usable link, holding the keys 'Url',
        'LastHeading', 'LinkText', 'SuggestedTitle', 'PageTitle', 'Tags'
        and 'InferredFileType'. Links whose URL cannot be built (for
        example because they have no href) are silently skipped.
    '''
    ret = []
    for link in links:
        # Best effort: a link without a usable href is skipped. Only
        # ordinary errors are swallowed, so KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        try:
            href = link['href'].encode('utf-8')
            url = urlparse.urljoin(baseurl, href)
        except Exception:
            continue
        linkdata = {'Url': url.decode('utf-8')}

        # Look the preceding heading up once and reuse the result.
        heading = link.findPrevious(head_re)
        if heading:
            linkdata['LastHeading'] = unicode(heading.text)
        else:
            linkdata['LastHeading'] = u''

        if hasattr(link, 'text'):
            linkdata['LinkText'] = decode_htmlentities(unicode(link.text))
            st = link.text.strip(' -_')
            # Drop a known file extension so that link text which is just
            # a file name yields a cleaner title.
            root, ext = os.path.splitext(st)
            if ext in listofallfileextensions:
                st = root
            st = tr_nan(st)
            linkdata['SuggestedTitle'] = decode_htmlentities(unicode(st))
        else:
            # No link text available: fall back to the URL itself.
            linkdata['LinkText'] = unicode(linkdata['Url'])
            linkdata['SuggestedTitle'] = unicode(linkdata['Url'])

        linkdata['PageTitle'] = self.get_context_title()
        # tag_by_data sees every key set so far; 'InferredFileType' is
        # added only afterwards, exactly as before.
        linkdata['Tags'] = tag_by_data(linkdata)
        linkdata['InferredFileType'] = linktype
        ret.append(linkdata)
    return ret