示例#1
0
    def tokens(self, tagname=None, endtag=None, stem=True, stoplanguage='english', lowercase=True):
        '''
          Return the content of the file at self.path as a list of tokens.

          The file is opened as utf-8 and undecodable bytes are ignored.

          tagname is None: the whole file is read and tokenized in one call,
          with ptb.ptbtokens when self.useptb is true and ptb.whitespace
          otherwise; the tokenizer's result is returned as is.

          tagname is given: the file is treated as a tagged document. Lines
          that follow a line equal to tagname are tokenized with ptb.ptbtokens
          until a line equal to endtag is reached. Both tags are compared
          against the whole whitespace-stripped line, so each tag must be
          specified completely and must sit on a line of its own. A repeated
          tagname line before endtag restarts the collection.

          stem, stoplanguage and lowercase are passed through unchanged to the
          tokenizer.

          In tagged mode the token list is returned, or None when the section
          is never closed by endtag or yields no tokens.
        '''
        # The context manager closes the handle on every exit path; the
        # previous version left the file open after returning.
        with codecs.open(self.path, encoding='utf-8', errors='ignore') as handle:
            if tagname is None:
                tokenize = ptb.ptbtokens if self.useptb else ptb.whitespace
                return tokenize(handle.read(), stem, stoplanguage, lowercase)

            toks = None
            found = False
            end = False
            for line in handle:
                line = line.strip()
                if found:
                    if line == endtag:
                        end = True
                        break
                    # extend in place instead of rebuilding the list per line
                    toks.extend(ptb.ptbtokens(line, stem, stoplanguage, lowercase))
                # Checked after the branch above on purpose: an opening tag
                # seen inside the section resets what was collected so far.
                if line == tagname:
                    found = True
                    toks = []

        if end and toks:
            return toks
        return None
示例#2
0
    def getheader(self, soup):
        '''
          Read title, keywords and description out of soup.head and
          accumulate them on self.title, self.keywords and self.description.

          Nothing happens when soup has no head. The title text is stored
          as is when self.title is None, otherwise it is appended after a
          single space. The content of the first keywords / description meta
          element is always appended after a single space, with a None
          attribute treated as the empty string first.

          When self.tokenize is set every extracted text is run through
          ptbtokenizer.ptbtokens (no stemming, no stop words, case kept) and
          the tokens are joined back with single spaces.

          Always returns None.
        '''
        head = soup.head
        if head is None:
            return None

        def prepare(text):
            # optional tokenization, re-joined into one space separated string
            if self.tokenize:
                pieces = ptbtokenizer.ptbtokens(text, stem= False, stoplanguage=None, lowercase=False)
                return ' '.join(pieces)
            return text

        def first_meta_content(meta_name):
            # content attribute of the first matching meta element, if any
            matches = head.find_all("meta", {"name" : meta_name})
            if len(matches) > 0:
                return matches[0].get("content")
            return None

        if head.title:
            text = head.title.get_text()
            if text is not None:
                text = prepare(text)
                self.title = text if self.title is None else self.title + ' ' + text

        text = first_meta_content("keywords")
        if text is not None:
            text = prepare(text)
            current = '' if self.keywords is None else self.keywords
            self.keywords = current + ' ' + text

        text = first_meta_content("description")
        if text is not None:
            text = prepare(text)
            current = '' if self.description is None else self.description
            self.description = current + ' ' + text
示例#3
0
 def getbodytext(self,soup):
     '''
       Append the text found under soup.body to self.bodytext.

       Every descendant of the body is visited. Descendants that expose a
       name attribute are skipped; the ones for which reading name raises
       AttributeError are treated as text. Text whose parent is a script
       element, or whose string is None, is ignored. Each remaining text is
       appended to self.bodytext after a single space.

       When self.tokenize is set the text is first run through
       ptbtokenizer.ptbtokens (no stemming, no stop words, case kept) and
       the tokens are joined with single spaces.

       Returns None; does nothing when soup has no body.
     '''
     body = soup.body
     if body is None:
         return
     for node in body.descendants:
         try:
             node.name
         except AttributeError:
             # no name attribute: handled as a text node
             if node.parent.name == 'script':
                 continue
             text = node.string
             if text is None:
                 continue
             if self.tokenize:
                 pieces = ptbtokenizer.ptbtokens(text, stem=False, stoplanguage=None, lowercase = False)
                 text = ' '.join(pieces)
             self.bodytext = self.bodytext + ' '+ text