# ----- Example #1 (예제 #1) -----
def main():

    """Fetch every configured page, store it, and save each matching code snippet,
    reporting progress on a console progress bar."""
    # One progress tick per page, totalled across every site.
    total_pages = sum(len(site['pages']) for site in SITES)
    progress = ProgressBar(
        widgets=[
            'Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA(),
            '  Saved ', Counter(), '/', str(total_pages), ' sites.'
        ],
        maxval=total_pages,
    )
    progress.start()

    pages_done = 0
    for site in SITES:

        snippet_selector = site['selector']

        for page_url in site['pages']:

            response = requests_session.get(page_url)
            document = BeautifulSoup(response.text, 'lxml')
            page_record = Page.create(
                text=document.text,
                location=page_url,
                title=document.title.text if document.title is not None else ''
            )

            # Persist every node that matches this site's snippet selector.
            for snippet_node in document.select(snippet_selector):
                save_snippet(snippet_node, page_record)

            pages_done += 1
            progress.update(pages_done)

    progress.finish()
# ----- Example #2 (예제 #2) -----
def save_page(language, query, link, rank, title):
    """Insert a search-result Page row; if it already exists (unique-key clash),
    refresh its mutable fields in place (upsert via EAFP)."""
    try:
        Page.create(language=language, query=query, link=link, rank=rank, title=title)
    except peewee.IntegrityError:
        # A row already exists for this (language, link, query) key — update it.
        existing = Page.get(
            Page.language == language,
            Page.link == link,
            Page.query == query,
        )
        existing.language = language
        existing.rank = rank
        existing.title = title
        existing.save()
# ----- Example #3 (예제 #3) -----
def get_page_object(page_id):
    """
    Return the Page for *page_id*, fetching its data from the API and
    creating the row when it is not yet present in the DB.
    """
    try:
        page = Page.get(Page.page_id == page_id)
    except DoesNotExist:
        # Cache miss: pull the raw record from the API and persist it.
        raw = get_raw_page_by_id(page_id)
        page = Page.create(page_id=raw['pageid'], page_title=raw['title'])
    return page
 def create_one_album(self, album_title = 'Album'):
     """Build one album owned by a random profile and fill it with randomly
     populated pages; returns self so calls can be chained."""
     all_pictures = Picture.objects.all()
     album = Album.create(album_title, 'First Page', self.pick_a_random_profile())
     total_pages = random.randrange(5,101)  # 5 <= total_pages <= 100
     for page_no in xrange(1, total_pages):
         page = Page.create('Page ' + str(page_no))
         pics_for_page = random.randrange(1,5)  # 1 <= pics_for_page <= 4
         for slot in xrange(1, pics_for_page):
             # Attach a randomly chosen picture from the database to this page.
             chosen = all_pictures[random.randrange(0, all_pictures.count())]
             page.add_picture(chosen, str(slot))
         # Page ready to be added to the album.
         album.add_page(page)
     return self
# ----- Example #5 (예제 #5) -----
        # NOTE(review): this span starts mid-function — the enclosing def (and the
        # cache-lookup branch this `try` pairs with) is outside the visible chunk.
        try:
            mark("check cache")
            f =  urllib2.urlopen(page_url) #TODO rate limit this or find some way to stop ourselves from being used as a DOS tool
            # Decode the body as UTF-8 — assumes every fetched page is UTF-8; TODO confirm.
            page_content = unicode(f.read(),'utf-8')
            #Create a cached page that we can fetch by URL later
            cached_page = CachedPage()
            cached_page.url = page_url
            cached_page.original = page_content
            cached_page.date = datetime.now()
            cached_page.save()
            mark("download page")
            #print "saved cached page %d" % cached_page.id
        except urllib2.HTTPError, error:
            raise error
            # NOTE(review): unreachable — `raise` above always exits this branch,
            # so the function never actually returns None on an HTTPError.
            return None
    # Persist the freshly downloaded page as a Page row.
    page = Page.create()
    page.url = page_url
    page.original = page_content
    page.save()
    css_stylesheets = []
    css_tags = []
    
    # Parse the HTML so links and styles can be rewritten/extracted below.
    doc_tree = lxml.html.document_fromstring(page_content).getroottree()

    mark("save page")
    makeLinksAbsolute(doc_tree, page_url)
    mark("make links absolute")
    parseStyleAttributes(doc_tree, css_tags, page)
    mark("parse style attributes")
    parseStyleTags(doc_tree, css_stylesheets, page)
    mark("parse style tags")