def parse(self, url):
    # Fetch and parse the page, retrying up to three times on request errors.
    links = ""
    content = ""
    for i in range(0, 3):
        try:
            page = Page(url, self.base_url, self.url_set)
            links, content = page.parse()
            break
        except RequestException as e:
            self.log.add_error(e)
        except Exception as e:
            self.log.add_error(e)
            break
    return (links, content)
def read_page(self, link, links):
    # Skip links that have already been processed.
    if link['link'] not in self.worked_links:
        self.worked_links.append(link['link'])
    else:
        return
    res = urljoin(self.site, link['link'])
    print(res)
    links += self.get_page_links(res)
    self.session.add(Page(website=self.site, url=res, title=link['title'],
                          description=link['desc'], ads=0, SSL=True,
                          multi_lang=None, points=None))
    self.session.commit()
    links = links[1:]
def test(self):
    # Rate an uploaded photo and verify the mark appears in the events modal.
    mark_value = 5
    marks = [mark_value]
    photos = self.upload_photo(USERNAME_SECOND)
    self.photos = photos
    name = self.get_name(USERNAME_FIRST)
    self.set_marks(None, photos, marks)
    self.login(USERNAME_SECOND)
    page = Page(self.driver)
    events_modal = page.top_menu.events_modal
    events_modal.open()
    self.assertTrue(events_modal.check_mark(name, marks[0]))
def web_requests(webURL):
    # try to request the page
    try:
        page = requests.get(webURL)
    except requests.ConnectionError as e:
        print(e)
        sys.exit(1)
    except requests.Timeout:
        print('request timed out\nExiting..')
        sys.exit(1)

    # if successful, parse the page
    soup = BeautifulSoup(page.content, 'html.parser')

    # get page attributes to create a new page object
    title = soup.find_all('title')[0].get_text()

    linksFull = soup.find_all('a')
    links = list()
    for link in linksFull:
        linkSrc = link.get('href')
        if internal_website(webURL, linkSrc):
            absoluteURL = relative_to_absolute(webURL, linkSrc)
            links.append(absoluteURL)

    paragraphListFull = soup.find_all('p')
    paragraphList = list()
    words = dict()

    titleSplit = title.split()
    for titleWord in titleSplit:
        scrubbed = scrub_string(titleWord)
        if scrubbed in words:
            words[scrubbed] += 1
        else:
            words[scrubbed] = 1

    # fill words dictionary
    for p in paragraphListFull:
        paragraph = p.get_text()
        paragraphList.append(paragraph)
        wordsList = paragraph.split()
        for word in wordsList:
            scrubbed = scrub_string(word)
            if scrubbed in words:
                words[scrubbed] += 1
            else:
                words[scrubbed] = 1

    # create page object
    p = Page(title, links, words, paragraphList)
    return p
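# web_requests() relies on three helpers that are not shown here
# (internal_website, relative_to_absolute, scrub_string). A minimal sketch of
# what they might look like, built only on the standard library, follows; the
# actual project versions may differ.
import string
from urllib.parse import urljoin, urlparse

def internal_website(base_url, href):
    # Treat empty/anchor links as external; otherwise compare hostnames.
    if not href or href.startswith('#'):
        return False
    target = urlparse(urljoin(base_url, href))
    return target.netloc == urlparse(base_url).netloc

def relative_to_absolute(base_url, href):
    # Resolve a relative link against the page it was found on.
    return urljoin(base_url, href)

def scrub_string(word):
    # Lower-case a token and strip surrounding punctuation before counting it.
    return word.lower().strip(string.punctuation)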
def __init__(self, total_memory_size, page_size):
    # define the attributes
    self.total_memory_size = total_memory_size
    self.page_size = page_size
    self.memory_tracker = {}
    self.list_of_all_pages = []
    self.total_number_of_pages = math.floor(self.total_memory_size / self.page_size)
    for i in range(self.total_number_of_pages):
        self.list_of_all_pages.append(Page(self.page_size))
    self.pages_free = SpaceBinaryTree(self.total_number_of_pages)
    logger.debug(
        "Number of pages available in memory %s of size %s bytes."
        % (len(self.list_of_all_pages), self.page_size))
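# Usage sketch for the constructor above. The owning class is not named in the
# snippet, so PagedMemory is a placeholder: 64 KiB of memory split into 4 KiB
# pages yields 16 Page objects and a matching free-space tree.
memory = PagedMemory(total_memory_size=64 * 1024, page_size=4096)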
def get_robots_delay(domain_url: str):
    """Get delay from "robots.txt" of `domain_url`

    Args:
        `domain_url`: URL, e.g. "ya.ru"

    Returns:
        floating point number - required delay in seconds.
        Defaults to 1 second.
    """
    real_domain_url = Page(domain_url).domain_url()
    robots_parser = RobotsProvider._get_robots_parser(real_domain_url)
    delay = robots_parser.crawl_delay(URLGetter.USERAGENT)
    if delay is None:
        return 1
    else:
        return delay
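# RobotsProvider._get_robots_parser is not shown above. Its use of
# crawl_delay(useragent) matches the standard library's urllib.robotparser,
# so a minimal sketch under that assumption could look like this:
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def _get_robots_parser(domain_url):
    # Download and parse robots.txt for the given domain.
    parser = RobotFileParser()
    parser.set_url(urljoin(domain_url, "/robots.txt"))
    parser.read()
    return parser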
def manage_comments():
    user = g.get('user', None)
    if user is None:
        return redirect(url_for('login'))
    else:
        if user.admin:
            item_count = queryNumById(Comments)
            page_arg = request.args.get('page', '1')
            page_index = getPageStr(page_arg)
            page = Page(item_count, page_index)
            comments_list = queryAllDesc(Comments, page.offset, page.limit)
            return render_template('manage_comments.html',
                                   comments=comments_list, page=page, user=user)
        else:
            return redirect(url_for('login'))
def Append(self, panel_or_page):
    '''
    Note: should only be called by UberBook and TabManager

    Creates a page from the panel provided if not already a page
    Adds that page to the container
    Returns that page to the parent notebook for tab generation
    '''
    if isinstance(panel_or_page, Page):
        page = panel_or_page
        page.Reparent(self)
    else:
        page = Page(self, panel_or_page)
    page.Size = self.Size
    return page
def start_robot():
    # Python 2 only: force utf-8 as the default encoding.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    patterns = get_lines_from_file('sloth/config/data_patterns.txt')
    start_pages_data = get_lines_from_file('sloth/config/start_pages.txt')
    pages = [Page(page_data[0], int(page_data[1]), {'category': page_data[2]})
             for page_data in start_pages_data]
    pool = Pool(pages)
    for r in range(5):
        robo = robot.Robot(pool, patterns)
        try:
            robo.start()
        except Exception as e:
            print(e)
        robo.kill()
def __init__(self, handle):
    ''' Initialize the toolbars and the reading board '''
    super(LetterMatch, self).__init__(handle)
    self.datapath = get_path(activity, 'instance')

    self.image_id = None
    self.audio_id = None

    if 'LANG' in os.environ:
        language = os.environ['LANG'][0:2]
    elif 'LANGUAGE' in os.environ:
        language = os.environ['LANGUAGE'][0:2]
    else:
        language = 'es'  # default to Spanish

    # FIXME: find some reasonable default situation
    language = 'es'

    self.letter = None
    self.activity_path = activity.get_bundle_path()
    self._lessons_path = os.path.join(self.activity_path, 'lessons', language)
    self._images_path = os.path.join(self.activity_path, 'images', language)
    self._sounds_path = os.path.join(self.activity_path, 'sounds', language)

    self.data_from_journal = {}
    if 'data_from_journal' in self.metadata:
        self.data_from_journal = json.loads(
            str(self.metadata['data_from_journal']))

    self._setup_toolbars()

    self.canvas = Gtk.DrawingArea()
    self.canvas.set_size_request(Gdk.Screen.width(), Gdk.Screen.height())
    self.canvas.modify_bg(Gtk.StateType.NORMAL, Gdk.color_parse("#000000"))
    self.canvas.show()
    self.set_canvas(self.canvas)

    self.mode = 'letter'

    self._page = Page(self.canvas, self._lessons_path, self._images_path,
                      self._sounds_path, parent=self)
def index():
    page = Page('GSearch!')
    graph = Graph_Dao()
    categories = graph.getCategories()
    print(categories)

    page.addScript("dbQueries.js", "gsearch")
    page.addScript("ajax.js")
    page.addScript("testAjax.js", "gsearch")

    page.addBlockCenter(blocks.testForm())
    page.addBlockCenter(
        blocks.test_paragraph({
            'page': page,
            'name': "Super Name!",
            'id': page.next_id()
        }))
    page.addBlockLeft(
        blocks.categoriesButtonList({
            'categories': categories,
            'page': page
        }))
    page.addBlockLeft(
        blocks.test_paragraph({
            'page': page,
            'name': "Super Name, le 2e!",
            'id': page.next_id()
        }))
    page.addBlockLeft(vue_blocks.vue_form({
        'page': page,
        'id': page.next_id()
    }))
    page.addBlockRight(blocks.button("Delete graph", "btn-danger", "buttonID"))
    page.addBlockRight(
        blocks.button("Create GSearch graph", "btn-primary", "gsearchButton"))
    page.addBlockRight(
        blocks.test_paragraph({
            'page': page,
            'name': "Super Name, le 3e!",
            'id': page.next_id()
        }))
    return page.render()
def get(self):
    page = Page()
    content = ContentPage()
    ship = Ship()
    finder = PageFinder()
    ship_class = ShipClass()
    self.header = ship.header
    current_view = ''
    ship_data = []

    # if there has been a request
    if self.request.GET:
        new_view = self.request.GET['id']
        # try to turn the new_view variable into an integer
        try:
            ship_data = ship_class.ship_class_list[int(new_view)]
            # Define local variables
            self.header = ship_data.header
            self.armaments = ship_data.armaments
            self.defenses = ship_data.defenses
            self.decks = ship_data.decks
            self.crew = ship_data.crew
            self.cruise_speed = ship_data.cruise_speed
            self.max_cruise_speed = ship_data.max_cruise_speed
            self.maximum_speed = ship_data.maximum_speed
            self.img_url = ship_data.img_url
            self.img_description = ship_data.img_description
            self.description = ship_data.description
        # if new_view is not a valid integer index, fall back to default data
        except (ValueError, IndexError):
            ship_data = ship

        # Define the current view from the new_view value
        current_view = finder.get_content_page(new_view)
        # current_view = current_view.format(**locals())
    else:
        # Grab main page concatenation
        current_view = page.print_main_page()
        # Insert local variables
        current_view = current_view.format(**locals())

    # Print page
    self.response.write(current_view)
def main(argv):
    """ INIT THE SPEECH RECOGNITION """
    print("A moment of silence, please...")
    with m as source:
        r.adjust_for_ambient_noise(source)
    r.energy_threshold = 100
    print("Set minimum energy threshold to {}".format(r.energy_threshold))

    """ INIT THE JOB """
    if os.path.isfile('jobs/TGP.conf'):
        with open('jobs/TGP.conf', 'rb') as f:
            p = pickle.load(f)
    printer.safe = True
    printer.init()
    job = Job(p['name'], Job.get_processor(p['processor']), p['lang'])
    job.processor.set_properties(p['processor_properties'])

    """ DO A PAGE """
    page = Page(printer.capture(), printer.getOcr(), job.get_language())
    if config.DEBUGGING:
        gui.setOriginal(page.getImageOriginal())
        gui.setProcessed(page.getImageProcessed())
    text = [Box(w) for w in job.processor.get_text(page)]

    """ SPAWN A PRINTING THREAD """
    def threaded_print():
        while True:
            if len(instructions) > 0:
                instruction = instructions.pop(0)
                printer.plot(instruction)
                if config.DEBUGGING:
                    gui.plot(instruction)
            else:
                time.sleep(1)
                continue

    listener_thread = threading.Thread(target=threaded_print)
    listener_thread.daemon = True
    listener_thread.start()

    """ START LISTENING """
    i = 0
    while True:
        print("LISTENING ({0})".format(i))
        with m as source:
            audio = r.record(source, duration=10)
        print("ANALYZING ({0})".format(i))
        process(audio, text, i)
        i += 1
def get(self):
    page = Page()  # creates an instance of the Page class defined in library.py

    mike = Person(5555)  # Password for his/her voicemail
    mike.name = "Mike Taatgen"  # Name of the user
    mike.text = 45  # Number of texts sent
    mike.minutes = 300  # Minutes talked on the phone
    mike.internet = 2  # GB of data used

    # The remaining users carry the same fields as above.
    anthony = Person(5235)
    anthony.name = "Anthony Kluba"
    anthony.text = 25
    anthony.minutes = 325
    anthony.internet = 7.2

    nate = Person(1821)
    nate.name = "Nathan Dickison"
    nate.text = 75
    nate.minutes = 290
    nate.internet = 3.2

    jairo = Person(8371)
    jairo.name = "Jairo Jurado"
    jairo.text = 25
    jairo.minutes = 400
    jairo.internet = 6

    rebecca = Person(9213)
    rebecca.name = "Rebecca Carroll"
    rebecca.text = 49
    rebecca.minutes = 280
    rebecca.internet = 4

    players = [mike, anthony, nate, jairo, rebecca]  # array with all the players

    self.response.write(page.header())  # Creates the HTML attributes
    self.response.write(page.form())
    if self.request.GET:
        player = int(self.request.GET['person']) - 1  # because index starts at 0
        self.response.write(self.html(players[player]))
    self.response.write(page.footer())
def read(self):
    if not self._robot.can_view('/'):
        raise ConnectionError("You are not allowed to scrape this site.")

    page = Page()
    request = requests.get(self._site)
    page.set_request(request)
    page.set_html(request.text)

    soup = BeautifulSoup(page.get_html(), 'html.parser')
    for link in (item['href'] for item in soup.find_all('a', href=True)
                 if item['href'] and item['href'] != "#" and item['href'] != '/'):
        if self._robot.can_view(link) and link not in page.get_links():
            if link[0] == "/":
                link = f"{self._site}{link}"
            page.add_link(link)

    self.add_page(page)
    return page
def get(self):
    i = ItemList()  # Set our item list object

    # Check if there was a GET request
    if self.request.GET:
        # if so, make our p variable a ContentPage object
        p = ContentPage()
        # Get the id of the item we're looking at
        id = self.request.GET['id']
        # use id to get recipe display and send to content page
        p.recipe = i.recipe_display(id)
        # use id to get item display and send to content page
        p.content = i.item_display(id)
    else:
        # if there is no GET request, make our p variable a Page object
        p = Page()
        # set our content to our item list display
        p.content = i.display()

    # Display whatever we have
    self.response.write(p.print_out())
def __read_pages(self):
    """This function is used to read the pages info from the files into the system"""
    path = os.path.join(self.cwd, 'data/pages')
    available_pages = os.listdir(path)
    if len(available_pages) > 0:
        for page_id in available_pages:
            if page_id == 'README.md':
                continue
            with open(os.path.join(path, page_id), 'r') as file:
                page_data = json.load(file)
            page = Page(name=page_data['name'], icon=None,
                        owner=page_data['owner'], id=page_data['id'])
            if page_data['timeline'] == False:
                page.init_admins(page_data['admins'])
                page.init_followers(page_data['followers'])
            else:
                page.set_as_timeline()
            page.init_posts(page_data['posts'])
            self.pages[page_id.split('.')[0]] = page
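# For reference, the loop above implies each file under data/pages/ holds a
# JSON document shaped roughly like the sketch below. The field names are taken
# from the keys the reader accesses; the concrete values are illustrative only.
EXAMPLE_PAGE_JSON = {
    "id": "42",
    "name": "Example Page",
    "owner": "some_user",
    "timeline": False,
    "admins": ["some_user"],
    "followers": ["another_user"],
    "posts": []
}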
def buildPageDict():
    '''
    builds a dictionary of 23andMe content pages. pagesFN comes from these two gdocs:
    - https://docs.google.com/spreadsheets/d/1mf86slweZEKUd5hzG2GmKGTGIpHuDipJz2u221y2zVE/edit?ts=568eb997#gid=0
    - https://docs.google.com/spreadsheets/d/1oo0sRmYFNeWikuOxcb_1obOoO35wQccmOzyGRmqDMtc/edit?ts=578578d0#gid=362797346
    '''
    utils.checkFileExistence(Node.config.pagesFN, 'Content pages')
    with open(Node.config.pagesFN, 'r') as pagesFile:
        pagesFile.readline()  # header
        for line in pagesFile:
            yccOld, snpName = line.strip().split()
            page = Page(yccOld, snpName)
            Node.pageList.append(page)
            if yccOld == Node.config.rootHaplogroup:
                Node.pageDict[Node.config.rootHaplogroup] = page
            elif snpName != '.':
                Node.pageDict[snpName] = page
def load_pages(root_path, element_types):
    background_path = get_image_path(root_path, pre_fix=["background"])
    element_path = get_image_path(root_path, pre_fix=element_types)

    pages = []
    for p in background_path:
        pages.append(Page(p))

    elements = {}
    for p in element_path:
        ele = Element(p)
        if ele.page_name not in elements:
            elements[ele.page_name] = {}
        if ele.element_type not in elements[ele.page_name]:
            elements[ele.page_name][ele.element_type] = []
        elements[ele.page_name][ele.element_type].append(ele)

    for p in pages:
        p.load_elements(elements.get(p.page_name))
    return pages
def scrape_url(self, url):
    """Scrapes a single URL."""
    print("Scraping {}".format(url))
    page = Page(url)
    self.pages[url] = page

    response = requests.get(url)
    if response.status_code != requests.codes.ok:
        return

    # NOTE: html.parser engine is slower than lxml
    bs = BeautifulSoup(response.text, 'lxml')
    parsed_url = urlparse(url)

    # Find links.
    self.find_links(page, bs, parsed_url)

    # Find assets.
    for tag_name, attr in self.asset_tags:
        for tag in bs.find_all(tag_name):
            link = tag.get(attr)
            # script src might be empty if it is inlined.
            if link:
                page.assets.add(link)
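# self.asset_tags is not shown above; the loop iterates over (tag_name, attr)
# pairs, so it is presumably something along these lines (assumed values):
asset_tags = [
    ('img', 'src'),      # images
    ('script', 'src'),   # external scripts
    ('link', 'href'),    # stylesheets, icons
]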
def process_url(self, url_to_process, url_depth):
    page = Page(url_to_process, client=self.client)
    self.website_structure[page.normalized_url] = {
        "url": page.normalized_url,
        "depth": url_depth,
        "status code": page.status_code,
        "h1": page.h1,
        "title": page.title,
        "time to response": page.response_time,
        "images": "\n".join(page.images),
        "h2 titles": "\n".join(page.get_titles(2)),
        "h3 titles": "\n".join(page.get_titles(3)),
        "h4 titles": "\n".join(page.get_titles(4)),
        "h5 titles": "\n".join(page.get_titles(5)),
        "h6 titles": "\n".join(page.get_titles(6)),
    }
    for link in page.links:
        logging.debug(f"Inner link: {link}")
        if link not in self.website_structure:
            self.urls_queue.append((link, url_depth + 1))
def get_pages_content(reachable_servers: list, config, pages: list):
    """The multithreaded part of the crawler logic."""
    print("=========== Start crawling all pages ===========")
    queue = Queue()
    threads = []
    for thread in range(NUM_THREADS):
        new_thread = Thread(target=crwaling, args=(thread, queue))
        new_thread.daemon = True
        new_thread.start()
        threads.append(new_thread)
    for server in reachable_servers:
        page = Page(config=config, page_type=server.get('TYPE'), ip=server.get('IP'))
        queue.put({'server': server, 'page': page, 'pages': pages})
    queue.join()
    print("================= End of crawling =================")
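# The worker target (spelled "crwaling" in the snippet) is not shown. For
# queue.join() above to return, each worker must call task_done() once per item
# it pulls; a minimal sketch of such a worker, under that assumption:
def crwaling(thread_id, queue):
    while True:
        item = queue.get()
        try:
            # item holds the 'server', 'page' and shared 'pages' list put() above;
            # the actual crawling of the Page object would happen here.
            item['pages'].append(item['page'])
        finally:
            queue.task_done()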
def build(self, docs_path):
    dist_path = self.config.general.dist_path
    self.prepare_dist(dist_path)

    pages = []
    for child in docs_path.iterdir():
        if child.is_dir():
            item = Section(self.config, child)
        else:
            if child.suffix != ".md":
                continue
            item = Page(self.config, child)
        pages.append(item)

    if self.config.general.enable_api_reference:
        pages.append(ApiReferencePage(self.config))

    sitemap = [p.to_nav_dict() for p in pages]
    for page in pages:
        page.save_html(dist_path, sitemap)
def process_response(robot, page, page_response):
    pages = []
    text = ''
    content = BeautifulSoup(page_response.content, 'html.parser').findAll(
        search_comp, attrs={'class': search_class})
    if content is not None:
        for item in content:
            text += item.text

    anchors = BeautifulSoup(page_response.text, 'html.parser').findAll('a')
    for anchor in anchors[:40]:
        if anchor.has_attr('href'):
            href = anchor['href']
            if is_same_host(page.url, href_prefix + href) is True:
                pages.append(
                    Page(href_prefix + href, page.depth, {'action': 'default'}))

    with open(output, 'a+') as out_file:
        out_file.write(text)
    return [], pages
def __init__(self, handle):
    ''' Initialize the toolbars and the reading board '''
    super(AEIOU, self).__init__(handle)

    if 'LANG' in os.environ:
        language = os.environ['LANG'][0:2]
    elif 'LANGUAGE' in os.environ:
        language = os.environ['LANGUAGE'][0:2]
    else:
        language = 'es'  # default to Spanish

    # FIXME: find some reasonable default situation
    language = 'es'

    self.activity_path = activity.get_bundle_path()
    self._lessons_path = os.path.join(self.activity_path, 'lessons', language)
    self._images_path = os.path.join(self.activity_path, 'images', language)
    self._sounds_path = os.path.join(self.activity_path, 'sounds', language)

    self._setup_toolbars()

    # Create a canvas
    canvas = Gtk.DrawingArea()
    width = Gdk.Screen.width()
    height = int(Gdk.Screen.height())
    canvas.set_size_request(width, height)
    canvas.modify_bg(Gtk.StateType.NORMAL, Gdk.color_parse("#000000"))
    canvas.show()
    self.set_canvas(canvas)

    self.mode = 'letter'

    self._page = Page(canvas, self._lessons_path, self._images_path,
                      self._sounds_path, parent=self)
def parse(self):
    j = 1
    while j <= int(self.itr):
        try:
            page = Page(self.url + "?itemsPerPage=100&order=asc&page=" + str(j) + "&sortBy=rank")
            soup = bs.BeautifulSoup(page.html, 'html.parser')
            center = soup.find('center')
            table = center.find('table', class_="maintable")
            table1 = table.find('table', class_="dataTable")
            username_list = table1.findAll('div', class_="user-name")
            for i in range(len(username_list)):
                username_list[i] = username_list[i].find('a')['href'].split('/')[-1]
            self.user_list.extend(username_list)
            j += 1
            print("Page:\t", j - 1, " :: ", "Users:\t", len(self.user_list))
        except Exception:
            print("Network Error from RankListParser.")
            time.sleep(2)
def test_case_1(browser):
    page = Page(browser)
    # Navigate to the URL https://www.sogeti.com/
    page.load()
    # Hover over Services Link
    page.hover('//*[@id="header"]/div[1]/nav/ul/li[3]/div')
    # and then Click Automation link.
    page.click('//*[@id="header"]/div[1]/div[5]/ul/li[7]/a')
    # On Automation Page, scroll down to the Contact us Form.
    page.scroll_into_view('//*[@id="99a12a58-3899-4fe1-a5c7-b9065fe635b0"]')
    # Fill the First Name with Random Generated Data.
    page.input_to('//*[@id="4ff2ed4d-4861-4914-86eb-87dfa65876d8"]', 'test')
    # Fill the Last Name with Random Generated Data
    page.input_to('//*[@id="11ce8b49-5298-491a-aebe-d0900d6f49a7"]', 'test')
    # Fill the Email with Random Generated Data
    page.input_to('//*[@id="056d8435-4d06-44f3-896a-d7b0bf4d37b2"]', '*****@*****.**')
    # Fill the Phone with Random Generated Data
    page.input_to('//*[@id="755aa064-7be2-432b-b8a2-805b5f4f9384"]', 'test')
    # Fill the Message with Random Generated Data
    page.input_to('//*[@id="88459d00-b812-459a-99e4-5dc6eff2aa19"]', 'test')
    # Check the I agree checkbox.
    page.click('//*[@id="863a18ee-d748-4591-bb64-ef6eae65910e"]/label/input')
    # Then Click SUBMIT button.
    page.click('//*[@id="06838eea-8980-4305-83d0-42236fb4d528"]')
    # After submitting, a Thank you message is displayed; assert it.
    assert 'Thank you for contacting us.' in page.get_attribute_of_element(
        '//*[@id="99a12a58-3899-4fe1-a5c7-b9065fe635b0"]/div[1]/div/p', 'textContent')
def showstudents(pageindex):
    username = request.cookies.get('username')
    if username is None:
        return redirect(url_for('login'))

    session = DBSession()
    student_count = session.query(Student).count()
    page_size = 5
    page = Page(student_count, int(pageindex), page_size)
    try:
        students = session.query(Student).offset(page.offset).limit(page.limit)
        list = []
        for i in students:
            list.append(i.to_dict())
        # list.append(page.__dict__)
        # return ResponseBody(0, list).getContent()
        return render_template("showStudent.html", list=list, page=page.__dict__)
    except Exception as e:
        logging.info(e)
        print(e)
        return ResponseBody(1, None)()
    finally:
        session.close()
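# The Page object used above is a pagination helper: it is built from an item
# count, a page index and a page size, and exposes .offset and .limit for the
# query. A minimal sketch of such a helper, assuming 1-based page indices:
class Page(object):
    def __init__(self, item_count, page_index, page_size=5):
        self.item_count = item_count
        self.page_size = page_size
        # Ceiling division; clamp out-of-range requests back to page 1.
        self.page_count = max(1, -(-item_count // page_size))
        self.page_index = page_index if 1 <= page_index <= self.page_count else 1
        self.offset = (self.page_index - 1) * page_size
        self.limit = page_size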
def test_search_in_yandex(self):
    driver = self.driver
    driver.implicitly_wait(3)
    logging.info(u'set wait time 3 sec')
    try:
        driver.get("https://www.yandex.ru/")
        logging.info(u'go to yandex.ru')
    except:
        logging.error(u'cannot go to yandex.ru')
    try:
        assert "Яндекс" in driver.title
        logging.info(u'it is yandex.ru')
    except:
        logging.error(u'it is not yandex.ru')
    try:
        search = Page(driver)
    except:
        logging.error(u'cannot create object of class')
    try:
        search.search_in_yandex('Тензор')
        logging.info(u'input text')
    except:
        logging.error(u"cannot input Тензор")
    try:
        search.check_suggest()
        logging.info(u'suggest is found')
    except:
        logging.error(u'suggest not found')
    try:
        search.click_on_search()
        logging.info(u'click succeeded')
    except:
        logging.error(u'cannot find button')
    try:
        search.check_link()
        logging.info(u'link is here')
    except:
        logging.error(u'cannot find link')
    assert "No results found." not in driver.page_source
def debug_load_pages():
    background_path = get_image_path("./java_screenhot", pre_fix=["background"])
    element_path = get_image_path("./java_screenhot", pre_fix=["control_Button"])

    pages = []
    for p in background_path:
        pages.append(Page(p))

    elements = {}
    for p in element_path:
        ele = Element(p)
        if ele.page_name not in elements:
            elements[ele.page_name] = {}
        if ele.element_type not in elements[ele.page_name]:
            elements[ele.page_name][ele.element_type] = []
        elements[ele.page_name][ele.element_type].append(ele)

    for p in pages:
        p.load_elements(elements.get(p.page_name))
    return pages