import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from flask import flash, redirect, render_template, request, session, url_for

# App-specific names (ScraperForm, SupervisorForm, Scraper, background_task)
# are assumed to come from this project's own modules.


def scraper():
    form = ScraperForm()
    if request.method == "POST":
        # Re-render the form so validation errors are shown.
        if not form.validate():
            return render_template('scraper.html', form=form)
        # Valid submission: kick off the scrape out of band.
        background_task()
        return "Success!"
    elif request.method == "GET":
        return render_template('scraper.html', form=form)
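# background_task() is called above but not defined in this section. A minimal
# sketch, assuming the intent is simply to run the scrape off the request thread
# so the POST can return immediately; a real app might use Celery or RQ instead.
# run_scraper is a hypothetical stand-in for the project's actual scrape routine.
import threading

def background_task():
    # Daemon thread so a hung scrape does not block interpreter shutdown.
    worker = threading.Thread(target=run_scraper, daemon=True)  # run_scraper: hypothetical
    worker.start()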
def get_Data():
    # Require a logged-in user before showing the page.
    if 'email' not in session:
        return redirect(url_for('home'))
    scraper = ScraperForm()
    scraper.set_supervisor_choices()
    form = SupervisorForm()
    if scraper.is_submitted():
        if not scraper.validate():
            flash('To get supervisor data, pick a name')
            return render_template('create_supervisor.html', scraper=scraper, form=form)
        # Scrape the selected supervisor's details and pre-fill the form with them.
        scraperInfo = Scraper()
        scraperInfo.getInformation(scraper.supervisorName.data)
        form.insertScraperInfo(scraperInfo)
        flash('The information found is shown in the form below')
        return render_template('create_supervisor.html', scraper=scraper, form=form)
    return render_template('create_supervisor.html', scraper=scraper, form=form)
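# SupervisorForm.insertScraperInfo() is used above but not shown. A minimal
# sketch, assuming it copies attributes from the Scraper result into the form's
# fields so the template renders them pre-filled; the field names (name, email,
# office) are placeholders, not the project's actual fields.
from flask_wtf import FlaskForm
from wtforms import StringField

class SupervisorForm(FlaskForm):
    name = StringField('Name')      # placeholder field
    email = StringField('Email')    # placeholder field
    office = StringField('Office')  # placeholder field

    def insertScraperInfo(self, scraperInfo):
        # Setting .data pre-populates each input when the form is rendered.
        self.name.data = scraperInfo.name
        self.email.data = scraperInfo.email
        self.office.data = scraperInfo.office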
def scraperform():
    form = ScraperForm()
    if request.method == 'GET' and form.validate():
        # --- Scraper settings, taken from the form ---
        # The URL of the first page you want to scrape.
        first_page = form.first_page.data
        # The URL of the last page you want to scrape (optional).
        last_page = form.last_page.data
        # The text of the "next" button.
        next_button = form.next_button.data
        # The name of the manga series as it appears in the URLs.
        series_name = form.series_name.data
        # Sometimes the "next" link contains only a partial URL; this prefix
        # completes it.
        URL_beginning = form.URL_beginning.data
        # Where you want to save the images.
        save_path = form.save_path.data

        # While there is a next page/chapter, this loop runs; when there is no
        # 'next chapter' or 'next' button left, it breaks.
        while first_page and first_page != last_page:
            # Optional print to track progress through the chapters.
            print(first_page)
            # requests fetches the page at the current URL.
            response = requests.get(first_page)
            # BeautifulSoup parses the response body into an HTML tree.
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find the link whose text is next_button and whose href contains
            # series_name: that link leads to the next page.
            end_URL = soup.find('a', string=next_button, href=re.compile(series_name))
            # Collect every image tag on the page, then each tag's source URL.
            img_tags = soup.find_all('img')
            URLs = [img['src'] for img in img_tags]
            for URL in URLs:
                # Name the file after the last path segment of the URL; skip
                # sources that don't look like image files.
                filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', URL)
                if filename is None:
                    continue
                # An image source can be relative; resolve it against the
                # current page's URL.
                if not URL.startswith('http'):
                    URL = urljoin(first_page, URL)
                # Download the image and write it to the local directory.
                response = requests.get(URL)
                with open(os.path.join(save_path, filename.group(1)), 'wb') as f:
                    f.write(response.content)
            # If there is no next page, break before end_URL['href'] raises an error.
            if end_URL is None:
                break
            # Move on to the next page.
            first_page = URL_beginning + end_URL['href']
    return render_template('scraperform.html', form=form)
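# None of the view functions above include their route decorators. A minimal
# sketch of wiring them into a Flask app; the URL rules here are illustrative
# guesses, not the project's actual routes.
from flask import Flask

app = Flask(__name__)
app.secret_key = 'change-me'  # required for session and flash to work
app.add_url_rule('/scraper', view_func=scraper, methods=['GET', 'POST'])
app.add_url_rule('/create_supervisor', view_func=get_Data, methods=['GET', 'POST'])
app.add_url_rule('/scraperform', view_func=scraperform, methods=['GET'])

if __name__ == '__main__':
    app.run(debug=True)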