def parsebloglinks(self, addonlyenglish=False):
    """Collect absolute blog URLs from the parsed listing page.

    Every ``<br class="blog-spacer">`` tag in ``self.soup`` is inspected.
    When the node directly after the tag is a plain text node (the blog
    summary), the node after that summary is expected to carry the blog's
    relative link in its ``href`` attribute. The link is prefixed with the
    site root and stored.

    Args:
        addonlyenglish: when true, entries whose summary is rejected by
            ``Util.istextenglish`` are skipped.

    Returns:
        The list of absolute blog URLs. The same list object is stored on
        ``self.bloglist``, replacing any previous value.
    """
    siteroot = 'http://www.travelblog.org'
    self.bloglist = []
    for spacer in self.soup.findAll('br', {'class': "blog-spacer"}):
        summary = spacer.nextSibling
        # Exact type match (rather than isinstance) preserves the existing
        # behaviour of ignoring NavigableString subclasses.
        if type(summary) is not BeautifulSoup.NavigableString:
            continue
        if addonlyenglish and not Util.istextenglish(summary):
            continue
        try:
            relativeurl = summary.nextSibling['href']
        except (TypeError, KeyError):
            # Nothing follows the summary, or what follows is not an element
            # with an href: skip this entry instead of aborting the page.
            continue
        self.bloglist.append(siteroot + relativeurl)
    return self.bloglist
def isvalidforindex(self, minbodylength=50):
    """Tell whether this blog qualifies for indexing.

    A blog qualifies when ``self.isafrica`` is truthy, its body is longer
    than ``minbodylength`` characters and ``Util.istextenglish`` accepts
    the body.

    Args:
        minbodylength: the body must be strictly longer than this many
            characters. Defaults to 50.

    Returns:
        True when all three conditions hold, otherwise False.
    """
    if not self.isafrica:
        return False
    body = self.blog.body
    # Length is tested before the language check: it is the cheaper test and
    # it keeps a missing (None) or empty body away from len() and the
    # language detector.
    if not body or len(body) <= minbodylength:
        return False
    return bool(Util.istextenglish(body))
# Maintenance script: walk the Blog search results and delete every blog
# whose body Util.istextenglish reports as not English, then log how many
# were deleted. Runs at import time; there is no main guard.
#
# NOTE(review): TravelPodParser, LoggerConfig and Elasticsearch are not
# referenced below; presumably they are imported for their side effects
# (e.g. logging or connection setup) -- confirm before removing or
# reordering any of these imports.
import ElasticMappings
import TravelPodParser
import Util
import LoggerConfig
import logging
from elasticsearch import Elasticsearch

logger = logging.getLogger(__name__)

# Only the slice 0:3500 of the search results is requested.
# NOTE(review): blogs beyond that slice are presumably never examined --
# confirm the cap is intended.
response = ElasticMappings.Blog.search()[0:3500].execute()
count = 0  # number of blogs deleted so far
for blog in response:
    # NOTE(review): `== False` deletes only when the helper's result equals
    # False; a None result would not trigger a delete. Switching to `not`
    # would widen what gets deleted, so verify the helper's return values
    # before changing this.
    if Util.istextenglish(blog.body) == False:
        blog.delete()
        count+=1
logger.info("Deleted {0} foreign blogs".format(count))
def parsebloglinks(self):
    """Collect the links of English blogs from the parsed listing page.

    Every ``<div>`` in ``self.soup`` whose class matches ``^blog_info$`` is
    inspected. The text of the div's second ``<p>`` is passed to
    ``Util.istextenglish``; when it is accepted, the ``href`` of the next
    ``<a>`` found from the div is stored.

    Divs with fewer than two paragraphs, or with no following link that has
    an ``href``, are skipped rather than aborting the whole page.

    Returns:
        The list of collected ``href`` values. The same list object is
        stored on ``self.bloglist``, replacing any previous value.
    """
    infoclass = re.compile('^blog_info$')
    self.bloglist = []
    for div in self.soup.findAll('div', attrs={'class': infoclass}):
        paragraphs = div.findAll('p')
        if len(paragraphs) < 2:
            # No second paragraph to run the language check on.
            continue
        if not Util.istextenglish(paragraphs[1].text):
            continue
        try:
            href = div.findNext('a')['href']
        except (TypeError, KeyError):
            # findNext returned nothing, or the link has no href attribute.
            continue
        self.bloglist.append(href)
    return self.bloglist