Example #1
 def GET(self, uid):
     page = Page()
     page.title = "教学视频"  # "Teaching videos"
     try:
         with open("%s/data/notes/%s/prod/1.txt" % (page.base_path, uid)) as f:
             notes_html = f.read()
     except IOError:
         return 'None'
     return render.notes(page, notes_html)
Example #2
 def __query_to_tokens(self, query):
     """
     Split the query into tokens
     """
     page = Page()
     page.content = query
     lexer = TokenLexer(page)
     return list(lexer.tokens())
Example #3
def findDedicateInPage(pageIndex, buff):
	print("START: scan page {0}".format(pageIndex))
	page = Page(URL, pageIndex, PAGE_SIZE)
	for photo in page.fetchPhotos():
		if Photo.fetchOneById(photo.id) is None:
			buff.append(photo)
			print("SCAN: {0} is qualified because no local record was found".format(photo.id))
		else:
			print("SKIP: {0} has a local record.".format(photo.id))
Example #4
    def test_tokens_AllExpectedTokensAreReturned(self):
        page = Page()
        page.content = "yEaH,PYthON?!,. is   not  . so,!? bad, as i thought:."

        lexer = TokenLexer(page)
        tokens = list(lexer.tokens())

        self.assertListEqual(tokens, [
            'yeah', 'python', 'not',
            'bad', 'i', 'thought'
        ])
Example #5
    def create(self):
        document = Document.get(self.params.get('document'))
        page = Page(name=self.params.get('name'),
                    document=document,
                    content=self.params.get('content'),
                    order=int(self.params.get('order')))
        # Save the record
        page.put()

        # Redirect back to the document's edit page
        self.redirect('/document/edit/' + self.params.get('document'))
Example #6
    def test_str_outputStringIsAsExpected(self):
        test_page = Page()
        test_page.title = 'D01'
        test_page.content = 'Bla Bla Blub'

        expected_output = os.linesep.join([
            '---------------------------------------------------------------------------',
            'D01',
            '---------------------------------------------------------------------------',
            'Bla Bla Blub'
        ])

        self.assertEqual(expected_output, str(test_page))
Example #7
    def _get_links_and_forms_and_store(self, url):
        if Page.query.filter(Page.url == url).first():
            return []

        r = requests.get(url, cookies=self.cookies)
        soup = BeautifulSoup(r.text, "html.parser")

        try:
            page = Page(self.website_id, url)
            page.save_to_db()
            self._get_forms(soup, page.id, url)
        except Exception as e:
            print("Could not store page (row may already exist):", e)

        return self._get_links(url, soup)
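A hedged sketch of a driver for this helper, assuming it lives on the same crawler class; the crawl method, seed handling, and visited set are illustrative additions, not part of the original:

    def crawl(self, seed_url):
        # Hypothetical breadth-first walk: store each page once and
        # follow every link the helper discovers.
        queue, visited = [seed_url], set()
        while queue:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            queue.extend(self._get_links_and_forms_and_store(url))
        return visited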
Example #8
def getPaging(pageIndex, downloadPhotoFn, tracker):
	pageObject = Page(URL, pageIndex, PAGE_SIZE)
	photos = pageObject.fetchPhotos()

	print("PREPARE: create download task for Page: {0}".format(pageIndex))
	with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_DOWNLOAD_WORKER) as downloadExecutor:
		for photo in photos:
			if tracker is not None:
				tracker.addTask(1)
			# Submit each download to the pool; for synchronous debugging,
			# call downloadPhotoFn(photo, tracker) directly instead.
			downloadExecutor.submit(downloadPhotoFn, photo, tracker)
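A hypothetical invocation of the function above; downloadPhoto, its photo.download() call, and tracker.taskDone are illustrative stand-ins for whatever worker the project actually passes in:

def downloadPhoto(photo, tracker):
	# Hypothetical worker: fetch one photo, then mark the tracker
	# task done if a tracker was supplied.
	photo.download()
	if tracker is not None:
		tracker.taskDone(1)

getPaging(1, downloadPhoto, tracker=None)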
Example #9
 def test_get_product(self):
     base_path = os.path.abspath('.') + os.sep + 'tests'
     file_base_path = 'file:///' + base_path
     link = os.path.join(file_base_path, 'produto_1.html')
     product = Page(EpocaCosmeticos(), link).get_product()
     self.assertEqual('Produto 1', product.name)
     self.assertEqual('Titulo do Produto 1', product.title)
     self.assertEqual(link, product.url)
Example #10
def get_article_list(index, type=''):
    try:
        index = int(index)
    except ValueError:
        index = 1
    type = str(type)

    query = {}
    if type != '' and type != 'all':
        tt = article_dao.get_article_type_by_name(str(type))
        if tt:
            query = {'type': str(tt['_id'])}
        else:
            return [], Page()

    item_limit = constants.item_limit * constants.item_limit_rate
    total_count = article_dao.get_article_count(query)
    page_count = int(math.ceil(total_count * 1.0 / item_limit))

    if index > page_count or index < 1:
        offset = 0
        index = 1
    else:
        offset = (index - 1) * item_limit

    page = Page()
    page.index = index
    # Show at most constants.page_count page links in the pager.
    page.count = min(page_count, constants.page_count)

    # Center the visible window of page links on the current index,
    # clamping it so it never runs past the last page.
    if page.index <= page.count / 2:
        page.start = 1
    else:
        page.start = page.index - int(page.count / 2)
        if page.start > page_count - page.count:
            page.start = page_count - page.count + 1

    result = list(article_dao.get_article_list(offset, item_limit, query))

    article_list = []
    for art in result:
        model = Article()
        model._id = art['_id']
        model.title = art['title']
        model.tag = art['tag']
        model.time = art['time']
        model.content = art['content']
        model.description = art['description']
        model.visit_count = art['visit_count']
        tt = article_dao.get_article_type_by_id(art['type'])
        if tt:
            model.type = tt['_id']
            model.type_name = tt['type']

        article_list.append(model)
    return article_list, page
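For example, assuming constants.page_count = 5 and page_count = 20 (with Python 2 integer division), index = 10 gives page.start = 8, so the pager shows links 8 through 12, centered on the current page.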
Example #11
    def link_crawler(self, crawlable, max_delay):
        main_url = crawlable.get_home_page()
        product_url_regex = crawlable.get_product_pages()

        main_page = Page(crawlable, main_url)
        main_crawling_pages = main_page.get_main_crawling_pages()
        crawl_queue = main_crawling_pages[:]
        current_main_page = 0
        main_pages_length = len(main_crawling_pages)

        all_visited, product_list = [], []
        products_visited = []

        while crawl_queue:
            url = crawl_queue.pop()

            try:
                if url not in all_visited:
                    if url in main_crawling_pages:
                        current_main_page += 1
                        print('\n%d out of %d main pages\n' %
                              (current_main_page, main_pages_length))

                    if max_delay and max_delay > 0:
                        # A random delay makes the crawler a bit harder to detect
                        time.sleep(random.randint(0, max_delay))

                    page = Page(crawlable, url)
                    all_visited.append(url)

                    if page.is_product and page.url not in products_visited:
                        product_list.append(page.get_product())
                        products_visited.append(page.url)

                    page_links = page.get_page_links()
                    for link in page_links:
                        if link not in all_visited:
                            crawl_queue.append(link)

            except Exception:
                traceback.print_exc(file=sys.stdout)
                if url not in all_visited:
                    all_visited.append(url)

        print(str(len(product_list)) + ' products found.')
        return product_list
Example #12
    def parse(self, text, url):
        dom = self.parseDocument(text)
        page = Page()
        page.title = self.get_text_from_element('title')
        page.content = self.remove_a_tags(self.get_text_from_element('body'))
        page.url = url

        def read_link(link):
            return URLUtils.join_relurl_to_absurl(url, link['href'])

        page.out_links = [read_link(link) for link in dom.select('a[href]')]
        page.out_links = ListUtil.to_list_without_duplicated_entries(page.out_links)

        return page
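A hypothetical call to the parser above; the HTML string, the parser instance, and the example URL are all illustrative:

html = ('<html><head><title>Home</title></head>'
        '<body><a href="/about">About</a></body></html>')
page = parser.parse(html, 'http://example.com/')
print(page.title)      # expected: 'Home'
print(page.out_links)  # expected: ['http://example.com/about']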
Example #13
 def GET(self):
     page = Page()
     page.curnav = 1
     return render.site(page)
Example #14
 def GET(self):
     page = Page()
     page.title = "信息学竞赛培训"  # "Informatics competition training"
     page.curnav = 4
     return render.noip(page)
Example #15
 def GET(self):
     page = Page()
     page.title = "课程设置"  # "Curriculum"
     page.curnav = 2
     return render.lesson(page)
Example #16
# Menu Components
button_dimensions = (100, 40)
font_style = "lucidaconsole"
title_font = pygame.font.SysFont(font_style, 72)
title = title_font.render("BLOCKY", True, (255, 0, 0))

# Info Components (built before the menu buttons so that info_page
# already exists when info_button references it below)
info_image = pygame.image.load('data/control_info.jpg')
info_image = pygame.transform.scale(info_image,
                                    (display_width // 2, display_height // 2))
back_button = Button(
    "back_button",
    ((display_width - button_dimensions[0]) / 2, display_height / 2 + 100),
    button_dimensions, menu, "Back", pygame.font.SysFont(font_style, 12))
info_page = Page(display, [back_button])

start_button = Button("start_button", (0, 0), button_dimensions, main, "Start",
                      pygame.font.SysFont(font_style, 12))
run_ai_button = Button("run_ai_button", (0, 0), button_dimensions, test_ai,
                       "Run A.I.", pygame.font.SysFont(font_style, 12))
info_button = Button("info_button", (0, 0), button_dimensions, info_page,
                     "Info", pygame.font.SysFont(font_style, 12))
button_list = [start_button, run_ai_button, info_button]
menu_page = Page(display, button_list)
menu_page.arrange_buttons("vertical",
                          (display_width // 2 - button_dimensions[0] // 2,
                           display_height // 2 - button_dimensions[1] // 2),
                          60)
Example #17
 def GET(self):
     page = Page()
     page.title = "关于千知 | 联系我们"  # "About Qianzhi | Contact us"
     page.curnav = 5
     return render.about(page)
Example #18
 def GET(self):
     page = Page()
     page.title = "名师介绍"  # "Meet our teachers"
     page.curnav = 3
     return render.teacher(page)
Example #19
 def show(self):
     self.site = Site.all().get()
     self.categories = Category.all().order('order').get()
     self.rec = Page.get(self.params.get('id'))
Example #20
    def get_topnews(results=8):
        URL = "https://ajax.googleapis.com/ajax/services/search/news?v=1.0&ned=%s&topic=%s&rsz=%d"
        editions = ("es_cl", "en_us")
        topics = {"w": "Internacional", "h": "Titulares"}
        i = 0

        for edition in editions:
            for topic in topics:
                url = URL % (edition, topic, results)
                print F, url
                response = urllib2.urlopen(url)
                data = response.read()

                news = json.loads(data)
                if news["responseStatus"] == 200:
                    for result in news["responseData"]["results"]:
                        data = {}

                        data["title"] = result["titleNoFormatting"]
                        data["locale"] = edition
                        data["date"] = result["publishedDate"]
                        data["url"] = result["url"]
                        data["type"] = "news"
                        data["id"] = md5(data["url"]).hexdigest()
                        data["content"] = ""

                        event = {}
                        event["title"] = data["title"]
                        event["locale"] = data["locale"]
                        event["description"] = result["content"]
                        event["date"] = data["date"]
                        e_id = event["id"] = md5("%s %s" % (repr(data["title"]), data["url"])).hexdigest()

                        print F, repr("Crawled news: %s" % data["title"])
                        e = Event(event)
                        e.save()

                        n = Page(data)
                        n.parent_id = e_id
                        n.save()
                        i += 1

                        if "relatedStories" in result:
                            for related in result["relatedStories"]:
                                data = {}
                                data["title"] = related["titleNoFormatting"]
                                data["locale"] = edition
                                data["date"] = related["publishedDate"]
                                data["url"] = related["url"]
                                data["id"] = md5(data["url"]).hexdigest()
                                data["type"] = "news"
                                data["content"] = ""

                                print F, repr("Related news: %s" % data["title"])
                                n = Page(data)
                                n.parent_id = e_id
                                n.save()
                                i += 1
                else:
                    print F, news["responseDetails"]

        print F, "total news collected: %d" % i
Example #21
    def get_top_news(self):
        tag = "[crawler/get_top_news]"

        base_url = settings.GN_BASE_URL
        editions = settings.GN_EDITIONS
        topics = settings.GN_TOPICS
        num_news = settings.GN_NUM_NEWS

        params = {
            'v': '1.0',
            'ned': '',
            'topic': '',
            'rsz': num_news
        }

        events = []
        pages = []

        for edition in editions:
            for topic in topics:
                params['ned'] = edition
                params['topic'] = topic

                url = base_url + '?' + urlencode(params)
                print tag, 'getting url', url
                response = urlopen(url)
                data = response.read()

                if data != '':
                    news = json.loads(data)
                else:
                    return None

                if news['responseStatus'] != 200:
                    return None

                for result in news['responseData']['results']:
                    data = {}
                    data['title'] = result['titleNoFormatting']
                    data['date'] = result['publishedDate']
                    data['url'] = result['url']
                    data['type'] = 'news'

                    # strip the query string so each article has one canonical URL
                    par = urlparse.urlparse(unquote(data['url']))
                    data['url'] = quote(par.scheme + '://' + par.netloc + par.path)

                    event = {}
                    event['title'] = data['title']
                    event['locale'] = edition
                    event['description'] = result['content']
                    event['date'] = data['date']
                    event['url'] = data['url']

                    event = Event(event)
                    page = Page(data)
                    page.parent_id = event.id

                    events.append(event)
                    pages.append(page)

                    print tag, "event:", data['title']

                    if 'relatedStories' not in result:
                        continue

                    for related in result['relatedStories']:
                        data = {}
                        data['title'] = related['titleNoFormatting']
                        data['date'] = related['publishedDate']
                        data['url'] = related['url']
                        data['type'] = 'news'

                        # strip the query string so each article has one canonical URL
                        par = urlparse.urlparse(unquote(data['url']))
                        data['url'] = quote(par.scheme + '://' + par.netloc + par.path)

                        page = Page(data)
                        page.parent_id = event.id
                        pages.append(page)

                        print tag, "page:", data['title']

        self.__save_events(events, pages)