예제 #1
0
def scraper():
  """Show the scraper form and start the background task on a valid POST.

  Returns:
    The string ``"Success!"`` after ``background_task()`` has been called
    for a POST whose form validates; otherwise the rendered
    ``scraper.html`` template bound to the form.
  """
  form = ScraperForm()

  if request.method == "POST" and form.validate():
    background_task()
    return "Success!"

  # Single fall-through for everything else: a GET, a POST that failed
  # validation, and any other verb that reaches this view (assuming Flask,
  # HEAD requests are dispatched to the GET view). Rendering here means the
  # view can never fall off the end and return None, which is not a valid
  # response.
  return render_template('scraper.html', form=form)
예제 #2
0
파일: routes.py 프로젝트: martinsoender/PIS
def get_Data():
  """Render the supervisor creation page, pre-filling it from the scraper.

  Visitors without an ``'email'`` key in the session are redirected to the
  ``home`` endpoint. When the scraper form has been submitted, either a
  hint is flashed (validation failed) or the information fetched for the
  chosen supervisor name is inserted into the supervisor form and a
  confirmation is flashed.

  Returns:
    A redirect to ``home``, or the rendered ``create_supervisor.html``
    template with both forms.
  """
  if 'email' not in session:
    return redirect(url_for('home'))

  scraper_form = ScraperForm()
  scraper_form.set_supervisor_choices()
  supervisor_form = SupervisorForm()

  if scraper_form.is_submitted():
    if scraper_form.validate():
      # Look the selected supervisor up and copy the result into the form.
      lookup = Scraper()
      lookup.getInformation(scraper_form.supervisorName.data)
      supervisor_form.insertScraperInfo(lookup)
      notice = 'Information found shown in form below'
    else:
      notice = 'To get supervisor data, pick a name'
    flash(notice)

  # Every non-redirect path ends on the same page with the same two forms.
  return render_template(
    'create_supervisor.html', scraper=scraper_form, form=supervisor_form)
예제 #3
0
def scraperform():
    """Crawl a paginated series and save the images of every visited page.

    When the request is a GET and ``ScraperForm`` validates, the crawl starts
    at the form's ``first_page`` and follows each page's "next" link. It
    stops when it reaches ``last_page`` (that page itself is not scraped),
    when a page has no matching "next" link, or when the link leads back to
    a page that was already processed.

    Returns:
        The rendered ``scraperform.html`` template, whether or not a crawl
        was performed.
    """
    form = ScraperForm()
    if request.method == 'GET' and form.validate():
        # NOTE(review): every value below comes straight from the submitted
        # form, so both the URLs fetched and the directory written to are
        # chosen by the client -- confirm this view is only reachable by
        # trusted users.
        page_url = form.first_page.data
        # URL of the last page to scrape (optional).
        last_page = form.last_page.data
        # Text of the link that leads to the next page.
        next_button = form.next_button.data
        # The next link is sometimes only a partial URL; this prefix is put
        # in front of it to build the full URL.
        url_beginning = form.URL_beginning.data
        # Directory the images are saved to.
        save_path = form.save_path.data
        # Name of the series as it appears in the URLs. Compiled once, since
        # the same pattern filters the "next" link on every page.
        series_link = re.compile(form.series_name.data)

        # Pages already processed; a "next" link that cycles back to one of
        # them would otherwise keep this request looping forever.
        visited = set()
        while page_url and page_url != last_page and page_url not in visited:
            visited.add(page_url)

            # Progress output while working through the pages.
            print(page_url)

            response = requests.get(page_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # The element whose text is the "next" label and whose href
            # mentions the series name.
            next_link = soup.find(string=next_button, href=series_link)

            _save_page_images(soup, page_url, save_path)

            if next_link is None:
                # No next page: stop instead of failing on the lookup below.
                break
            page_url = url_beginning + next_link['href']
    return render_template('scraperform.html', form=form)


def _save_page_images(soup, page_url, save_path):
    """Download every jpg/gif/png image referenced by a parsed page.

    Args:
        soup: Parsed page whose ``<img>`` tags are inspected.
        page_url: URL the page was fetched from; relative image sources are
            resolved against it.
        save_path: Directory the image files are written into.
    """
    # Function-scope import so this block does not depend on the file's
    # import section.
    from urllib.parse import urljoin

    # Captures the final path segment when it is a jpg/gif/png file name.
    image_name = re.compile(r'/([\w_-]+[.](jpg|gif|png))$')

    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # <img> without a usable src attribute: nothing to download.
            continue

        # urljoin resolves relative, root-relative and protocol-relative
        # sources against the page URL and leaves absolute URLs untouched.
        image_url = urljoin(page_url, src)

        match = image_name.search(image_url)
        if match is None:
            # Not a plain jpg/gif/png path (other extension, query string,
            # data URI, ...): skip it rather than fail on match.group().
            continue

        image = requests.get(image_url)

        # The file is opened only after the download returned, so a failed
        # request does not leave an empty file behind.
        with open(os.path.join(save_path, match.group(1)), 'wb') as f:
            f.write(image.content)