def GET(self, uid):
    page = Page()
    page.title = "教学视频"  # "Teaching videos"
    try:
        # Read the pre-rendered notes for this video, if they exist.
        with open("%s/data/notes/%s/prod/1.txt" % (page.base_path, uid)) as f:
            notes_html = f.read()
    except IOError:
        return 'None'
    return render.notes(page, notes_html)
def __query_to_tokens(self, query):
    """Split the query into tokens."""
    page = Page()
    page.content = query
    lexer = TokenLexer(page)
    return list(lexer.tokens())
def findDedicateInPage(pageIndex, buff):
    print("START: scan page {0}".format(pageIndex))
    page = Page(URL, pageIndex, PAGE_SIZE)
    for photo in page.fetchPhotos():
        if Photo.fetchOneById(photo.id) is None:
            # No local record yet, so the photo qualifies for download.
            buff.append(photo)
            print("SCAN: {0} is qualified because no local record was found".format(photo.id))
        else:
            print("SKIP: {0} has a local record.".format(photo.id))
def test_tokens_AllExpectedTokensAreReturned(self):
    page = Page()
    page.content = "yEaH,PYthON?!,. is not . so,!? bad, as i thought:."
    lexer = TokenLexer(page)
    tokens = list(lexer.tokens())
    self.assertListEqual(tokens, ['yeah', 'python', 'not', 'bad', 'i', 'thought'])
def create(self):
    document = Document.get(self.params.get('document'))
    page = Page(
        name=self.params.get('name'),
        document=document,
        content=self.params.get('content'),
        order=int(self.params.get('order'))
    )
    # Save the entity.
    page.put()
    # Redirect back to the document's edit page.
    self.redirect('/document/edit/' + self.params.get('document'))
def test_str_outputStringIsAsExpected(self):
    test_page = Page()
    test_page.title = 'D01'
    test_page.content = 'Bla Bla Blub'
    expected_output = os.linesep.join([
        '---------------------------------------------------------------------------',
        'D01',
        '---------------------------------------------------------------------------',
        'Bla Bla Blub'
    ])
    self.assertEqual(expected_output, str(test_page))
def _get_links_and_forms_and_store(self, url):
    # Skip URLs that have already been stored.
    if Page.query.filter(Page.url == url).first():
        return []
    r = requests.get(url, cookies=self.cookies)
    soup = BeautifulSoup(r.text, "html.parser")
    try:
        page = Page(self.website_id, url)
        page.save_to_db()
        self._get_forms(soup, page.id, url)
    except Exception as e:
        print("Row already exists", e)
    return self._get_links(url, soup)
def getPaging(pageIndex, downloadPhotoFn, tracker):
    pageObject = Page(URL, pageIndex, PAGE_SIZE)
    photos = pageObject.fetchPhotos()
    print("PREPARE: create download task for Page: {0}".format(pageIndex))
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_DOWNLOAD_WORKER) as downloadExecutor:
        executor = downloadExecutor  # set to None to force synchronous downloads
        for photo in photos:
            if tracker is not None:
                tracker.addTask(1)
            if executor is None:
                downloadPhotoFn(photo, tracker)
            else:
                executor.submit(downloadPhotoFn, photo, tracker)
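# A minimal usage sketch of getPaging (assumption: a downloadPhoto(photo, tracker)
# callable and an optional progress tracker are defined elsewhere in this module;
# passing None skips progress tracking).
getPaging(1, downloadPhoto, None)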
def test_get_product(self):
    crawler = Crawler()
    base_path = os.path.abspath('.') + os.sep + 'tests'
    file_base_path = 'file:///' + base_path
    link = os.path.join(file_base_path, 'produto_1.html')
    epoca = EpocaCosmeticos()
    print(epoca.get_product_pages())
    product = Page(EpocaCosmeticos(), link).get_product()
    self.assertEqual('Produto 1', product.name)
    self.assertEqual('Titulo do Produto 1', product.title)
    self.assertEqual(link, product.url)
def get_article_list(index, type=''):
    try:
        index = int(index)
    except ValueError:
        index = 1
    type = str(type)
    query = {}
    if type != '' and type != 'all':
        tt = article_dao.get_article_type_by_name(str(type))
        if tt:
            query = {'type': str(tt['_id'])}
        else:
            return [], Page()
    item_limit = constants.item_limit * constants.item_limit_rate
    total_count = article_dao.get_article_count(query)
    page_count = int(math.ceil(total_count * 1.0 / item_limit))
    if index > page_count or index < 1:
        offset = 0
        index = 1
    else:
        offset = (index - 1) * item_limit
    page = Page()
    page.index = index
    page.count = constants.page_count if page_count > constants.page_count else page_count
    # Center the pagination window around the current index, then clamp it
    # so it never runs past the last page.
    if page.index <= page.count / 2:
        page.start = 1
    else:
        page.start = page.index - int(page.count / 2)
    if page.start > page_count - page.count:
        page.start = page_count - page.count + 1
    result = list(article_dao.get_article_list(offset, item_limit, query))
    article_list = []
    for art in result:
        model = Article()
        model._id = art['_id']
        model.title = art['title']
        model.tag = art['tag']
        model.time = art['time']
        model.content = art['content']
        model.description = art['description']
        model.visit_count = art['visit_count']
        tt = article_dao.get_article_type_by_id(art['type'])
        if tt:
            model.type = tt['_id']
            model.type_name = tt['type']
        article_list.append(model)
    return article_list, page
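# A worked example of the pagination-window arithmetic above (the concrete
# numbers are assumed; constants.page_count is taken to be 10 here).
page_count, window = 87, 10   # total pages, size of the navigation window
index = 42                    # requested page
start = 1 if index <= window / 2 else index - int(window / 2)
if start > page_count - window:
    start = page_count - window + 1
print(start)  # 37 -> the nav bar would show pages 37..46, centered on page 42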
def link_crawler(self, crawlable, max_delay):
    main_url, product_url_regex = crawlable.get_home_page(), crawlable.get_product_pages()
    main_page = Page(crawlable, main_url)
    main_crawling_pages = main_page.get_main_crawling_pages()
    crawl_queue = main_crawling_pages[:]
    current_main_page = 0
    main_pages_length = len(main_crawling_pages)
    all_visited, product_list = [], []
    products_visited = []
    while crawl_queue:
        url = crawl_queue.pop()
        try:
            if url not in all_visited:
                if url in main_crawling_pages:
                    current_main_page += 1
                    print('\n%d out of %d main pages\n' % (current_main_page, main_pages_length))
                if max_delay and max_delay > 0:
                    # Sleep a random interval to make the crawler harder to detect.
                    time.sleep(random.randint(0, max_delay))
                page = Page(crawlable, url)
                all_visited.append(url)
                if page.is_product and page.url not in products_visited:
                    product_list.append(page.get_product())
                    products_visited.append(page.url)
                page_links = page.get_page_links()
                for link in page_links:
                    if link not in all_visited:
                        crawl_queue.append(link)
        except Exception as e:
            traceback.print_exc(file=sys.stdout)
            if url not in all_visited:
                all_visited.append(url)
    print(str(len(product_list)) + ' products found.')
    return product_list
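# A minimal usage sketch (assumption: Crawler owns link_crawler and
# EpocaCosmeticos is a crawlable site definition, as in the tests above;
# max_delay is the upper bound, in seconds, of the random pause between requests).
crawler = Crawler()
products = crawler.link_crawler(EpocaCosmeticos(), max_delay=2)
for product in products:
    print(product.name, product.url)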
def parse(self, text, url):
    dom = self.parseDocument(text)
    page = Page()
    page.title = self.get_text_from_element('title')
    page.content = self.remove_a_tags(self.get_text_from_element('body'))
    page.url = url

    def read_link(link):
        # Resolve relative hrefs against the page URL.
        return URLUtils.join_relurl_to_absurl(url, link['href'])

    page.out_links = [read_link(link) for link in dom.select('a[href]')]
    page.out_links = ListUtil.to_list_without_duplicated_entries(page.out_links)
    return page
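# A minimal usage sketch of parse() (assumption: the method lives on a parser
# class, instantiated here under the hypothetical name DocumentParser; the HTML
# string and URL are invented for illustration).
parser = DocumentParser()
page = parser.parse('<html><title>T</title><body><a href="/a">a</a></body></html>',
                    'http://example.com/')
print(page.title)      # 'T'
print(page.out_links)  # expected: ['http://example.com/a']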
def GET(self):
    page = Page()
    page.curnav = 1
    return render.site(page)
def GET(self):
    page = Page()
    page.title = "信息学竞赛培训"  # "Informatics Olympiad training"
    page.curnav = 4
    return render.noip(page)
def GET(self):
    page = Page()
    page.title = "课程设置"  # "Course offerings"
    page.curnav = 2
    return render.lesson(page)
clock.tick(120)

# Menu Components
button_dimensions = (100, 40)
font_style = "lucidaconsole"
title_font = pygame.font.SysFont(font_style, 72)
title = title_font.render("BLOCKY", True, (255, 0, 0))
start_button = Button("start_button", (0, 0), button_dimensions, main, "Start",
                      pygame.font.SysFont(font_style, 12))
run_ai_button = Button("run_ai_button", (0, 0), button_dimensions, test_ai, "Run A.I.",
                       pygame.font.SysFont(font_style, 12))
info_button = Button("info_button", (0, 0), button_dimensions, info_page, "Info",
                     pygame.font.SysFont(font_style, 12))
button_list = [start_button, run_ai_button, info_button]
menu_page = Page(display, button_list)
menu_page.arrange_buttons("vertical",
                          (display_width // 2 - button_dimensions[0] // 2,
                           display_height // 2 - button_dimensions[1] // 2),
                          60)

# Info Components
info_image = pygame.image.load('data/control_info.jpg')
info_image = pygame.transform.scale(info_image, (display_width // 2, display_height // 2))
back_button = Button("back_button",
                     ((display_width - button_dimensions[0]) / 2, display_height / 2 + 100),
                     button_dimensions, menu, "Back",
                     pygame.font.SysFont(font_style, 12))
info_page = Page(display, [back_button])
def GET(self):
    page = Page()
    page.title = "关于千知 | 联系我们"  # "About Qianzhi | Contact us"
    page.curnav = 5
    return render.about(page)
def GET(self):
    page = Page()
    page.title = "名师介绍"  # "Meet the teachers"
    page.curnav = 3
    return render.teacher(page)
def show(self):
    self.site = Site.all().get()
    self.categories = Category.all().order('order').get()
    self.rec = Page.get(self.params.get('id'))
def get_topnews(results=8):
    URL = "https://ajax.googleapis.com/ajax/services/search/news?v=1.0&ned=%s&topic=%s&rsz=%d"
    editions = ("es_cl", "en_us")
    topics = {"w": "Internacional", "h": "Titulares"}
    i = 0
    for edition in editions:
        for topic in topics:
            url = URL % (edition, topic, results)
            print F, url
            response = urllib2.urlopen(url)
            data = response.read()
            news = json.loads(data)
            if news["responseStatus"] == 200:
                for result in news["responseData"]["results"]:
                    data = {}
                    data["title"] = result["titleNoFormatting"]
                    data["locale"] = edition
                    data["date"] = result["publishedDate"]
                    data["url"] = result["url"]
                    data["type"] = "news"
                    data["id"] = md5(data["url"]).hexdigest()
                    data["content"] = ""
                    event = {}
                    event["title"] = data["title"]
                    event["locale"] = data["locale"]
                    event["description"] = result["content"]
                    event["date"] = data["date"]
                    e_id = event["id"] = md5("%s %s" % (repr(data["title"]), data["url"])).hexdigest()
                    print F, repr("Crawled news: %s" % data["title"])
                    e = Event(event)
                    e.save()
                    n = Page(data)
                    n.parent_id = e_id
                    n.save()
                    if "relatedStories" in result:
                        for related in result["relatedStories"]:
                            data = {}
                            data["title"] = related["titleNoFormatting"]
                            data["locale"] = edition
                            data["date"] = related["publishedDate"]
                            data["url"] = related["url"]
                            data["id"] = md5(data["url"]).hexdigest()
                            data["type"] = "news"
                            data["content"] = ""
                            print F, repr("Related news: %s" % data["title"])
                            n = Page(data)
                            n.parent_id = e_id
                            n.save()
                    i += 1
            else:
                print F, news["responseDetails"]
    print F, "total news collected: %d" % i
def get_top_news(self):
    tag = "[crawler/get_top_news]"
    base_url = settings.GN_BASE_URL
    editions = settings.GN_EDITIONS
    topics = settings.GN_TOPICS
    num_news = settings.GN_NUM_NEWS
    params = {'v': '1.0', 'ned': '', 'topic': '', 'rsz': num_news}
    events = []
    pages = []
    for edition in editions:
        for topic in topics:
            params['ned'] = edition
            params['topic'] = topic
            url = base_url + '?' + urlencode(params)
            print tag, 'getting url', url
            response = urlopen(url)
            data = response.read()
            if data != '':
                news = json.loads(data)
            else:
                return None
            if news['responseStatus'] != 200:
                return None
            for result in news['responseData']['results']:
                data = {}
                data['title'] = result['titleNoFormatting']
                data['date'] = result['publishedDate']
                data['url'] = result['url']
                data['type'] = 'news'
                # Strip the query string so every article has a single canonical URL.
                par = urlparse.urlparse(unquote(data['url']))
                data['url'] = quote(par.scheme + '://' + par.netloc + par.path)
                event = {}
                event['title'] = data['title']
                event['locale'] = edition
                event['description'] = result['content']
                event['date'] = data['date']
                event['url'] = data['url']
                event = Event(event)
                page = Page(data)
                page.parent_id = event.id
                events.append(event)
                pages.append(page)
                print tag, "event:", data['title']
                if 'relatedStories' not in result:
                    continue
                for related in result['relatedStories']:
                    data = {}
                    data['title'] = related['titleNoFormatting']
                    data['date'] = related['publishedDate']
                    data['url'] = related['url']
                    data['type'] = 'news'
                    # Strip the query string so every article has a single canonical URL.
                    par = urlparse.urlparse(unquote(data['url']))
                    data['url'] = quote(par.scheme + '://' + par.netloc + par.path)
                    page = Page(data)
                    page.parent_id = event.id
                    pages.append(page)
                    print tag, "page:", data['title']
    self.__save_events(events, pages)