Пример #1
0
    def parsebloglinks(self, addonlyenglish=False):
        """Collect absolute blog URLs from the parsed page.

        Every ``<br class="blog-spacer">`` in ``self.soup`` is expected to be
        followed by a text node (the summary) and then by an element that
        carries the blog's relative ``href``. Each such ``href`` is prefixed
        with ``http://www.travelblog.org`` and stored.

        Args:
            addonlyenglish: When true, entries whose summary text is not
                accepted by ``Util.istextenglish`` are left out.

        Returns:
            The list of collected URLs; the same list is kept on
            ``self.bloglist``.
        """
        self.bloglist = []

        for spacer in self.soup.findAll('br', {'class': "blog-spacer"}):
            summary = spacer.nextSibling
            # Exact type match on purpose: subclasses of NavigableString are
            # not treated as summaries, exactly as before.
            if type(summary) is not BeautifulSoup.NavigableString:
                continue
            if addonlyenglish and not Util.istextenglish(summary):
                continue

            link = summary.nextSibling
            try:
                relativeurl = link['href']
            except (TypeError, KeyError):
                # No following node, a non-tag node, or a tag without an
                # href: skip this entry instead of aborting the whole page.
                continue
            self.bloglist.append('http://www.travelblog.org' + relativeurl)

        return self.bloglist
Пример #2
0
 def isvalidforindex(self):
     """Tell whether this entry passes all checks required for indexing.

     The checks run in order and stop at the first one that fails:
     the ``isafrica`` flag, ``Util.istextenglish`` on the blog body, and a
     body length greater than 50. As with a chained ``and``, a failing
     check's own (falsy) value is returned; otherwise the result of the
     length comparison is returned.
     """
     africa = self.isafrica
     if not africa:
         return africa

     english = Util.istextenglish(self.blog.body)
     if not english:
         return english

     return len(self.blog.body) > 50
import ElasticMappings
import TravelPodParser
import Util
import LoggerConfig
import logging
from elasticsearch import Elasticsearch

logger = logging.getLogger(__name__)

# Run an unfiltered Blog search restricted to the slice [0:3500].
blog_search = ElasticMappings.Blog.search()
response = blog_search[0:3500].execute()

# Delete each returned blog whose body Util.istextenglish reports as False,
# keeping a tally of the deletions for the log line below.
count = 0
for blog in response:
    is_foreign = Util.istextenglish(blog.body) == False
    if not is_foreign:
        continue
    blog.delete()
    count += 1

logger.info("Deleted {0} foreign blogs".format(count))
Пример #4
0
 def parsebloglinks(self):
     """Collect the links of English-language blog entries on the page.

     Looks in ``self.soup`` for ``div`` elements whose class matches
     ``^blog_info$``. For each one, the text of its second ``<p>`` is
     passed to ``Util.istextenglish``; when that is truthy, the ``href`` of
     the next ``<a>`` found after the div is recorded.

     Divs with fewer than two paragraphs, or with no following anchor that
     has an ``href``, are skipped rather than raising, so one malformed
     entry does not abort parsing of the whole page.

     Returns:
         The list of collected hrefs; the same list is kept on
         ``self.bloglist``.
     """
     self.bloglist = []
     blog_info_class = re.compile('^blog_info$')

     for div in self.soup.findAll('div', attrs={'class': blog_info_class}):
         paragraphs = div.findAll('p')
         if len(paragraphs) < 2:
             # No second paragraph to run the language check on.
             continue
         if not Util.istextenglish(paragraphs[1].text):
             continue

         anchor = div.findNext('a')
         try:
             href = anchor['href']
         except (TypeError, KeyError):
             # findNext found nothing, or the anchor has no href.
             continue
         self.bloglist.append(href)

     return self.bloglist