def marshal_link_metadata(self, links, linktype, baseurl,
                          listofallfileextensions):
    '''Marshal the metadata glean-able from each link's context together
    into a dictionary for the edification of other bits of this program.

    links -- iterable of parsed anchor tags; each is indexed with
        ['href'] and queried with findPrevious() and .text
    linktype -- value stored unchanged under 'InferredFileType'
    baseurl -- base against which each link's href is resolved
    listofallfileextensions -- container of file extensions (in the
        form os.path.splitext() yields them); a matching extension is
        dropped from the suggested title

    Returns a list with one dictionary per usable link, holding the keys
    'Url', 'LastHeading', 'LinkText', 'SuggestedTitle', 'PageTitle',
    'Tags' and 'InferredFileType'. Links for which no URL can be built
    (for instance because they have no href) are skipped.
    '''
    ret = []
    for link in links:
        linkdata = {}
        try:
            url = urlparse.urljoin(baseurl, link['href'].encode('utf-8'))
        except Exception:
            # Best effort: a link without a usable href is simply left
            # out. Catch Exception rather than everything so that
            # KeyboardInterrupt / SystemExit still propagate.
            continue
        # urljoin() hands back unicode when baseurl is unicode; calling
        # .decode() on that would force an implicit ASCII encode and blow
        # up on non-ASCII characters, so only decode byte strings.
        if not isinstance(url, unicode):
            url = url.decode('utf-8')
        linkdata['Url'] = url

        # Nearest preceding element matching head_re, looked up once.
        heading = link.findPrevious(head_re)
        if heading:
            linkdata['LastHeading'] = unicode(heading.text)
        else:
            linkdata['LastHeading'] = u''

        if hasattr(link, 'text'):
            linkdata['LinkText'] = decode_htmlentities(unicode(link.text))
            st = link.text.strip(' -_')
            # If the link text looks like a file name, suggest the name
            # without its extension.
            stem, ext = os.path.splitext(st)
            if ext in listofallfileextensions:
                st = stem
            st = tr_nan(st)
            linkdata['SuggestedTitle'] = decode_htmlentities(unicode(st))
        else:
            # No link text available: fall back to the URL itself.
            linkdata['LinkText'] = unicode(linkdata['Url'])
            linkdata['SuggestedTitle'] = unicode(linkdata['Url'])

        linkdata['PageTitle'] = self.get_context_title()
        linkdata['Tags'] = tag_by_data(linkdata)
        linkdata['InferredFileType'] = linktype
        ret.append(linkdata)
    return ret
def _get_metadata(self):
    '''Assemble the metadata dictionary for the page held in self._bs.

    The dictionary gets the keys 'LastHeading', 'LinkText',
    'SuggestedTitle', 'PageTitle', 'InferredFileType' and 'Tags'. If the
    page's BibTeX block (the <p> inside <div id="bibtex">) can be
    extracted and parsed, 'BibTeX' is set to the first (key, entry) pair
    read back and 'SuggestedTitle' is replaced by that entry's 'title'
    field.

    Any failure while handling the BibTeX is reported with
    traceback.print_exc() and otherwise ignored, leaving the page-derived
    values in place.
    '''
    linkdata = {}
    linkdata['LastHeading'] = u''
    linkdata['LinkText'] = unicode(
        self._bs.find('ul', id='clinks').li.a['title'])
    linkdata['SuggestedTitle'] = self.get_context_title()
    linkdata['PageTitle'] = self.get_context_title()
    linkdata['InferredFileType'] = u'pdf'
    linkdata['Tags'] = []
    try:
        # A fairly brute force way of coercing the bit of html containing
        # the bibtex into a useable format: pretty-print it, then keep
        # only the non-empty lines that do not start with a tag.
        bibtexdata = [
            unicode(decode_htmlentities(line.replace(' ', '')))
            for line in
            self._bs.find('div', id='bibtex').p.prettify().split('\n')
            if len(line) > 0 and line[0] != '<'
        ]
        bibtexdata = u'\n'.join(bibtexdata)

        bibpath = os.path.join(CACHEDIR, 'bibtexsc.bib')
        # 'with' closes the file even if the write fails. Encode
        # explicitly: writing unicode to a plain file object would
        # implicitly encode as ASCII and fail on non-ASCII text.
        with open(bibpath, 'w') as f:
            f.write(
                unicode(decode_htmlentities(bibtexdata)).encode('utf-8'))

        btread = pybtex_entries_from_file(bibpath)
        entries = list(btread.items())
        for ky, vl in entries:
            print(pybtex_to_pieberry(ky, vl))
        # An empty result raises IndexError here, which is reported by
        # the handler below like any other BibTeX failure.
        first = entries[0]
        linkdata['BibTeX'] = first
        # Each item is a (key, entry) pair; the fields live on the entry,
        # not on the pair itself.
        linkdata['SuggestedTitle'] = first[1].fields['title']
    except Exception:
        traceback.print_exc()
    # NOTE(review): linkdata is never returned in the visible code, so
    # callers receive None - confirm whether a return was intended.
def _get_metadata(self):
    '''Assemble the metadata dictionary for the page held in self._bs.

    The dictionary gets the keys 'LastHeading', 'LinkText',
    'SuggestedTitle', 'PageTitle', 'InferredFileType' and 'Tags'. If the
    page's BibTeX block (the <p> inside <div id="bibtex">) can be
    extracted and parsed, 'BibTeX' is set to the first (key, entry) pair
    read back and 'SuggestedTitle' is replaced by that entry's 'title'
    field.

    Any failure while handling the BibTeX is reported with
    traceback.print_exc() and otherwise ignored, leaving the page-derived
    values in place.
    '''
    linkdata = {}
    linkdata['LastHeading'] = u''
    linkdata['LinkText'] = unicode(
        self._bs.find('ul', id='clinks').li.a['title'])
    linkdata['SuggestedTitle'] = self.get_context_title()
    linkdata['PageTitle'] = self.get_context_title()
    linkdata['InferredFileType'] = u'pdf'
    linkdata['Tags'] = []
    try:
        # A fairly brute force way of coercing the bit of html containing
        # the bibtex into a useable format: pretty-print it, then keep
        # only the non-empty lines that do not start with a tag.
        bibtexdata = [
            unicode(decode_htmlentities(line.replace(' ', '')))
            for line in
            self._bs.find('div', id='bibtex').p.prettify().split('\n')
            if len(line) > 0 and line[0] != '<'
        ]
        bibtexdata = u'\n'.join(bibtexdata)

        bibpath = os.path.join(CACHEDIR, 'bibtexsc.bib')
        # 'with' closes the file even if the write fails. Encode
        # explicitly: writing unicode to a plain file object would
        # implicitly encode as ASCII and fail on non-ASCII text.
        with open(bibpath, 'w') as f:
            f.write(
                unicode(decode_htmlentities(bibtexdata)).encode('utf-8'))

        btread = pybtex_entries_from_file(bibpath)
        entries = list(btread.items())
        for ky, vl in entries:
            print(pybtex_to_pieberry(ky, vl))
        # An empty result raises IndexError here, which is reported by
        # the handler below like any other BibTeX failure.
        first = entries[0]
        linkdata['BibTeX'] = first
        # Each item is a (key, entry) pair; the fields live on the entry,
        # not on the pair itself.
        linkdata['SuggestedTitle'] = first[1].fields['title']
    except Exception:
        traceback.print_exc()
    # NOTE(review): linkdata is never returned in the visible code, so
    # callers receive None - confirm whether a return was intended.
def get_context_title(self):
    '''Return the string of the parsed page's <title> element, passed
    through decode_htmlentities() and converted to unicode.'''
    raw_title = self._bs.title.string
    decoded_title = decode_htmlentities(raw_title)
    return unicode(decoded_title)