def tokens(self, tagname=None, endtag=None, stem=True, stoplanguage='english', lowercase=True):
    '''
    Return the content of the file at self.path as a list of tokens.

    The file is read as UTF-8 and undecodable bytes are ignored.

    If tagname is None the whole file is tokenized in one go: with
    ptb.ptbtokens when self.useptb is true, otherwise with ptb.whitespace.

    If tagname is not None the file is treated as a tagged document. Only
    the lines after a line equal to tagname and before the next line equal
    to endtag are tokenized, always with ptb.ptbtokens. Lines are stripped
    before being compared, so a tag has to be given completely and has to
    be alone on its line (for example a title tag would be passed as
    '<title>' with '</title>' as endtag).

    stem, stoplanguage and lowercase are forwarded unchanged to the
    tokenizer.

    Returns the list of tokens. In tagged mode None is returned when the
    section was never opened, was never closed by endtag, or produced no
    tokens.
    '''
    # The context manager closes the handle on every return path.
    with codecs.open(self.path, encoding='utf-8', errors='ignore') as handle:
        if tagname is None:
            tokenize = ptb.ptbtokens if self.useptb else ptb.whitespace
            return tokenize(handle.read(), stem, stoplanguage, lowercase)

        toks = None
        found = False
        end = False
        for line in handle:
            line = line.strip()
            if found:
                if line == endtag:
                    end = True
                    break
                toks.extend(ptb.ptbtokens(line, stem, stoplanguage, lowercase))
            # Checked even while inside a section: a repeated tagname line
            # restarts the collection from an empty list.
            if line == tagname:
                found = True
                toks = []

    if end and toks:
        return toks
    return None
def getheader(self, soup):
    '''
    Collect title, keywords and description from the head of a parsed page.

    soup is expected to expose a head attribute; when it is None nothing
    is done. The text of head.title is appended to self.title, and the
    "content" attribute of the first meta tag named "keywords" /
    "description" is appended to self.keywords / self.description.

    When self.tokenize is true every piece of text is first run through
    ptbtokenizer.ptbtokens (no stemming, no stop words, no lowercasing)
    and the tokens are re-joined with single spaces.

    Note that keywords and description always receive a leading space,
    while title does not when it was None before.

    Always returns None.
    '''
    head = soup.head
    if head is None:
        return None

    def normalise(text):
        # Optionally tokenize and glue the tokens back together.
        if not self.tokenize:
            return text
        words = ptbtokenizer.ptbtokens(text, stem=False, stoplanguage=None, lowercase=False)
        return ' '.join(words)

    def first_meta_content(meta_name):
        # "content" of the first matching meta tag, or None if there is none.
        matches = head.find_all("meta", {"name": meta_name})
        if not matches:
            return None
        return matches[0].get("content")

    if head.title:
        text = head.title.get_text()
        if text is not None:
            text = normalise(text)
            self.title = text if self.title is None else self.title + ' ' + text

    text = first_meta_content("keywords")
    if text is not None:
        text = normalise(text)
        previous = '' if self.keywords is None else self.keywords
        self.keywords = previous + ' ' + text

    text = first_meta_content("description")
    if text is not None:
        text = normalise(text)
        previous = '' if self.description is None else self.description
        self.description = previous + ' ' + text
def getbodytext(self, soup):
    '''
    Append the visible text found under soup.body to self.bodytext.

    Every descendant of the body is visited. A node is treated as text
    when it has no name attribute; text whose parent is a script element
    is skipped, as are nodes whose string is None.

    When self.tokenize is true each text is run through
    ptbtokenizer.ptbtokens (no stemming, no stop words, no lowercasing)
    and re-joined with single spaces.

    Each collected text is appended to self.bodytext preceded by a single
    space. self.bodytext is left untouched when there is no body or no
    text was collected. Returns None.
    '''
    body = soup.body
    if body is None:
        return

    # Gather the pieces first and concatenate once; growing the attribute
    # inside the loop copies the whole string for every text node.
    pieces = []
    for node in body.descendants:
        if hasattr(node, 'name'):
            # Nodes exposing a name are elements, not text.
            continue
        if node.parent.name == 'script':
            continue
        content = node.string
        if content is None:
            continue
        if self.tokenize:
            content = ptbtokenizer.ptbtokens(content, stem=False, stoplanguage=None, lowercase=False)
            content = ' '.join(content)
        pieces.append(content)

    if pieces:
        self.bodytext = self.bodytext + ' ' + ' '.join(pieces)