def main():
    """Scrape every configured site, saving each page and its code snippets.

    Renders a console progress bar that advances once per downloaded page.
    """
    total_pages = sum(len(site['pages']) for site in SITES)
    widgets = [
        'Progress: ', Percentage(), ' ',
        Bar(marker=RotatingMarker()), ' ',
        ETA(), ' Saved ', Counter(), '/', str(total_pages), ' sites.',
    ]
    progress = ProgressBar(widgets=widgets, maxval=total_pages)
    progress.start()

    pages_done = 0
    for site in SITES:
        css_selector = site['selector']
        for url in site['pages']:
            response = requests_session.get(url)
            soup = BeautifulSoup(response.text, 'lxml')
            # Persist the page itself, tolerating documents with no <title>.
            page = Page.create(
                text=soup.text,
                location=url,
                title=soup.title.text if soup.title is not None else '',
            )
            # Save every snippet matched by this site's CSS selector.
            for node in soup.select(css_selector):
                save_snippet(node, page)
            pages_done += 1
            progress.update(pages_done)

    progress.finish()
def save_page(language, query, link, rank, title):
    """Insert a Page row, or refresh the existing one on a duplicate key.

    On an IntegrityError the row matching (language, link, query) is
    fetched and its rank/title (and language) are updated and saved.
    """
    try:
        pg = Page.create(
            language=language, query=query, link=link, rank=rank, title=title
        )
    except peewee.IntegrityError:
        pg = Page.get(
            Page.language == language,
            Page.link == link,
            Page.query == query,
        )
        pg.language = language
        pg.rank = rank
        pg.title = title
        pg.save()
def get_page_object(page_id):
    """Return the Page for *page_id*, creating it from the API when absent.

    Checks the local DB first (EAFP); on a miss, fetches raw page data
    via the API and persists a new Page row.
    """
    try:
        return Page.get(Page.page_id == page_id)
    except DoesNotExist:
        raw = get_raw_page_by_id(page_id)
        return Page.create(page_id=raw['pageid'], page_title=raw['title'])
def create_one_album(self, album_title='Album'):
    """Build one Album filled with random pages and pictures; return self.

    Python 2 code (uses xrange). Page/picture counts are random; note the
    1-based xrange bounds mean the loops run count-1 times each.
    """
    all_pictures = Picture.objects.all()
    album = Album.create(album_title, 'First Page', self.pick_a_random_profile())

    page_count = random.randrange(5, 101)  # 5 <= page_count <= 100
    for page_no in xrange(1, page_count):
        page = Page.create('Page ' + str(page_no))

        picture_count = random.randrange(1, 5)  # 1 <= picture_count <= 4
        for slot in xrange(1, picture_count):
            # Pick a random picture from the database and add it to the page.
            chosen = all_pictures[random.randrange(0, all_pictures.count())]
            page.add_picture(chosen, str(slot))

        # Page ready to be added to the album.
        album.add_page(page)
    return self
# NOTE(review): this fragment is the interior of a function whose `def` is not
# visible here (it uses `return` and an unbound `page_url`). Python 2 code:
# urllib2, `except X, e` syntax, unicode(). Line structure reconstructed from
# a collapsed source line -- confirm indentation against the original file.
try:
    mark("check cache")
    # TODO rate limit this or find some way to stop ourselves from being used as a DOS tool
    f = urllib2.urlopen(page_url)
    page_content = unicode(f.read(),'utf-8')
    # Create a cached page that we can fetch by URL later
    cached_page = CachedPage()
    cached_page.url = page_url
    cached_page.original = page_content
    cached_page.date = datetime.now()
    cached_page.save()
    mark("download page")
    #print "saved cached page %d" % cached_page.id
except urllib2.HTTPError, error:
    raise error
    # NOTE(review): unreachable -- the `raise` above always exits this branch.
    return None

# Persist the live Page record alongside the cache entry.
page = Page.create()
page.url = page_url
page.original = page_content
page.save()

css_stylesheets = []
css_tags = []
# Parse the downloaded HTML into an lxml tree for link/style rewriting below.
doc_tree = lxml.html.document_fromstring(page_content).getroottree()
mark("save page")
makeLinksAbsolute(doc_tree, page_url)
mark("make links absolute")
parseStyleAttributes(doc_tree, css_tags, page)
mark("parse style attributes")
parseStyleTags(doc_tree, css_stylesheets, page)
mark("parse style tags")