예제 #1
0
 def marshal_link_metadata(self, links, linktype, baseurl,
                           listofallfileextensions):
     '''Build one metadata dictionary per link from the link's context.

     links -- iterable of link objects; each must support ``link['href']``,
         ``link.findPrevious(...)`` and (optionally) ``link.text``
         (presumably BeautifulSoup tags -- confirm against the caller).
     linktype -- stored unchanged under 'InferredFileType'.
     baseurl -- base against which each href is resolved.
     listofallfileextensions -- extensions, in the form returned by
         ``os.path.splitext``, to strip from the end of the link text
         when deriving the suggested title.

     Returns a list of dictionaries with the keys 'Url', 'LastHeading',
     'LinkText', 'SuggestedTitle', 'PageTitle', 'Tags' and
     'InferredFileType'. Links whose URL cannot be built are skipped.
     '''
     ret = []
     for link in links:
         linkdata = {}
         try:
             linkdata['Url'] = urlparse.urljoin(
                 baseurl, link['href'].encode('utf-8'))
         except Exception:
             # Best effort: a link with a missing or unusable href is
             # dropped. Exception (rather than a bare except) lets
             # KeyboardInterrupt and SystemExit propagate.
             continue
         linkdata['Url'] = linkdata['Url'].decode('utf-8')

         # Look the preceding heading up once and reuse the result.
         heading = link.findPrevious(head_re)
         if heading:
             linkdata['LastHeading'] = unicode(heading.text)
         else:
             linkdata['LastHeading'] = u''

         if hasattr(link, 'text'):
             linkdata['LinkText'] = decode_htmlentities(unicode(link.text))
             st = link.text.strip(' -_')
             # Drop a trailing known file extension from the title text.
             root, ext = os.path.splitext(st)
             if ext in listofallfileextensions:
                 st = root
             st = tr_nan(st)
             linkdata['SuggestedTitle'] = decode_htmlentities(unicode(st))
         else:
             # No link text available: fall back to the URL itself.
             linkdata['LinkText'] = unicode(linkdata['Url'])
             linkdata['SuggestedTitle'] = unicode(linkdata['Url'])

         linkdata['PageTitle'] = self.get_context_title()
         # tag_by_data receives the dictionary as populated so far.
         linkdata['Tags'] = tag_by_data(linkdata)
         linkdata['InferredFileType'] = linktype
         ret.append(linkdata)
     return ret
예제 #2
0
 def marshal_link_metadata(self, links, linktype, baseurl,
                           listofallfileextensions):
     '''Marshal the metadata gleanable from each link's context into a
     list of dictionaries for use by other parts of the program.

     links -- iterable of link objects providing ``link['href']``,
         ``link.findPrevious(...)`` and, where present, ``link.text``
         (presumably BeautifulSoup tags -- confirm against the caller).
     linktype -- value recorded as 'InferredFileType' for every link.
     baseurl -- base URL each href is joined onto.
     listofallfileextensions -- file extensions (as produced by
         ``os.path.splitext``) removed from the link text when building
         the suggested title.

     Each returned dictionary holds 'Url', 'LastHeading', 'LinkText',
     'SuggestedTitle', 'PageTitle', 'Tags' and 'InferredFileType'. A link
     for which no URL can be constructed is left out of the result.
     '''
     ret = []
     for link in links:
         try:
             joined = urlparse.urljoin(
                 baseurl, link['href'].encode('utf-8'))
         except Exception:
             # Skip links without a usable href, but do not swallow
             # KeyboardInterrupt/SystemExit as a bare except would.
             continue
         linkdata = {'Url': joined.decode('utf-8')}

         # Single lookup of the nearest preceding heading.
         previous_heading = link.findPrevious(head_re)
         if previous_heading:
             linkdata['LastHeading'] = unicode(previous_heading.text)
         else:
             linkdata['LastHeading'] = u''

         if hasattr(link, 'text'):
             linkdata['LinkText'] = decode_htmlentities(unicode(link.text))
             st = link.text.strip(' -_')
             # Split once; strip the extension only if it is a known one.
             stem, extension = os.path.splitext(st)
             if extension in listofallfileextensions:
                 st = stem
             st = tr_nan(st)
             linkdata['SuggestedTitle'] = decode_htmlentities(unicode(st))
         else:
             # Without link text the URL doubles as text and title.
             linkdata['LinkText'] = unicode(linkdata['Url'])
             linkdata['SuggestedTitle'] = unicode(linkdata['Url'])

         linkdata['PageTitle'] = self.get_context_title()
         # Tags are derived from the fields gathered above.
         linkdata['Tags'] = tag_by_data(linkdata)
         linkdata['InferredFileType'] = linktype
         ret.append(linkdata)
     return ret
예제 #3
0
 def _get_metadata(self):
     '''Collect metadata for the current page into a dictionary and, when
     possible, enrich it from the BibTeX record embedded in the page.

     The defaults are: 'LinkText' from the ``title`` attribute of the
     first link in the ``ul`` with id ``clinks``, 'SuggestedTitle' and
     'PageTitle' from ``get_context_title()``, 'InferredFileType' of
     u'pdf', and empty 'LastHeading' and 'Tags'.

     The text lines of the paragraph inside the ``div`` with id
     ``bibtex`` are then written to ``bibtexsc.bib`` under CACHEDIR and
     read back with ``pybtex_entries_from_file``. The first entry is
     stored under 'BibTeX' and its ``title`` field replaces
     'SuggestedTitle'. Any failure in this step prints a traceback and
     leaves the defaults in place.
     '''
     # NOTE(review): linkdata is built but not returned in the visible
     # code -- confirm whether a return was intended.
     linkdata = {}
     linkdata['LastHeading'] = u''
     linkdata['LinkText'] = unicode(
         self._bs.find('ul', id='clinks').li.a['title'])
     linkdata['SuggestedTitle'] = self.get_context_title()
     linkdata['PageTitle'] = self.get_context_title()
     linkdata['InferredFileType'] = u'pdf'
     linkdata['Tags'] = []
     try:
         # A fairly brute force way of coercing the bit of html
         # containing the bibtex into a usable format: keep only the
         # non-empty lines that do not start with a tag. Only '&nbsp;'
         # entities are removed; ordinary spaces must survive, or titles
         # and other field values would be run together.
         prettified = self._bs.find('div', id='bibtex').p.prettify()
         bibtexdata = u'\n'.join([
             unicode(decode_htmlentities(line.replace('&nbsp;', '')))
             for line in prettified.split('\n')
             if line and not line.startswith('<')
             ])

         bibtexpath = os.path.join(CACHEDIR, 'bibtexsc.bib')
         # The with block closes the file even if the write raises.
         with open(bibtexpath, 'w') as f:
             f.write(unicode(decode_htmlentities(bibtexdata)))

         entries = pybtex_entries_from_file(bibtexpath).items()
         for ky, vl in entries:
             # Single-argument print(...) behaves the same as the print
             # statement under Python 2.
             print(pybtex_to_pieberry(ky, vl))

         # items() yields (key, entry) pairs, so the title has to be
         # read from the entry, not from the pair itself.
         first = entries[0]
         linkdata['BibTeX'] = first
         linkdata['SuggestedTitle'] = first[1].fields['title']
     except Exception:
         traceback.print_exc()
예제 #4
0
    def _get_metadata(self):
        '''Assemble a metadata dictionary for the current page, using the
        page's embedded BibTeX record to refine it when that can be parsed.

        Default values: 'LinkText' is the ``title`` attribute of the first
        link inside the ``ul`` with id ``clinks``; 'SuggestedTitle' and
        'PageTitle' come from ``get_context_title()``; 'InferredFileType'
        is u'pdf'; 'LastHeading' and 'Tags' are empty.

        The non-tag lines of the paragraph in the ``div`` with id
        ``bibtex`` are saved to ``bibtexsc.bib`` in CACHEDIR and parsed
        with ``pybtex_entries_from_file``. The first parsed entry is kept
        under 'BibTeX' and supplies 'SuggestedTitle' from its ``title``
        field. If anything in that step fails, the traceback is printed
        and the default values remain.
        '''
        # NOTE(review): linkdata is never returned in the visible code --
        # confirm whether a return was intended.
        linkdata = {}
        linkdata['LastHeading'] = u''
        linkdata['LinkText'] = unicode(
            self._bs.find('ul', id='clinks').li.a['title'])
        linkdata['SuggestedTitle'] = self.get_context_title()
        linkdata['PageTitle'] = self.get_context_title()
        linkdata['InferredFileType'] = u'pdf'
        linkdata['Tags'] = []
        try:
            # A fairly brute force way of coercing the bit of html
            # containing the bibtex into a usable format: drop empty
            # lines and lines starting with a tag, and remove '&nbsp;'.
            raw_lines = self._bs.find(
                'div', id='bibtex').p.prettify().split('\n')
            bibtexdata = [
                unicode(decode_htmlentities(ln.replace('&nbsp;', '')))
                for ln in raw_lines
                if ln and not ln.startswith('<')
            ]
            bibtexdata = u'\n'.join(bibtexdata)

            cachefile = os.path.join(CACHEDIR, 'bibtexsc.bib')
            # Context manager guarantees the handle is closed on error.
            with open(cachefile, 'w') as f:
                f.write(unicode(decode_htmlentities(bibtexdata)))

            btitems = pybtex_entries_from_file(cachefile).items()
            for ky, vl in btitems:
                # print(...) with one argument is equivalent to the
                # print statement under Python 2.
                print(pybtex_to_pieberry(ky, vl))

            # Each item is a (key, entry) pair; the 'fields' mapping
            # lives on the entry, not on the pair.
            first_item = btitems[0]
            linkdata['BibTeX'] = first_item
            linkdata['SuggestedTitle'] = first_item[1].fields['title']
        except Exception:
            traceback.print_exc()
예제 #5
0
 def get_context_title(self):
     '''Return the parsed page's title as a unicode string.

     The title element's string is passed through
     ``decode_htmlentities`` before conversion. When the document has no
     title element an empty unicode string is returned instead of
     raising AttributeError, mirroring the u'' fallback used for a
     missing 'LastHeading'.
     '''
     title_tag = self._bs.title
     if title_tag is None:
         # Page without a <title>: nothing to decode.
         return u''
     return unicode(decode_htmlentities(title_tag.string))
예제 #6
0
 def get_context_title(self):
     '''Give the title of the page held in ``self._bs`` as unicode.

     The string of the title element is run through
     ``decode_htmlentities`` and converted with ``unicode``. A document
     lacking a title element yields u'' rather than an AttributeError,
     the same empty fallback used for a missing 'LastHeading'.
     '''
     title = self._bs.title
     if title is None:
         # No <title> element in the document.
         return u''
     return unicode(decode_htmlentities(title.string))