Example #1
from pywebcopy import save_website


def _run_copy(project_url, dest_path):
    # Mirror the whole site into dest_path, ignoring robots.txt.
    save_website(
        url=project_url,
        project_folder=dest_path,
        bypass_robots=True,
    )
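A possible call of the helper above (the URL and destination folder are illustrative, not part of the original snippet):

_run_copy('https://example.com/', './example-mirror')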
Example #2
from datetime import datetime

from pywebcopy import save_website


def downloadWebPage(url, dirDownload):
    """
    Downloads a whole webpage from the specified url.
        url: The base url to download.
        dirDownload: Directory path where the webpage should be downloaded to.
    """

    beginTime = datetime.now()
    kwargs = {
        "bypass_robots": True,
        "project_name": "recognisable-name",
        "load_css": False,
        "load_images": False,
        "load_javascript": False
    }
    try:
        save_website(url=url, project_folder=dirDownload, **kwargs)
    except Exception:
        print("Downloading webpage from '{0}' failed".format(url))

    print("Total run time taken by script: {0}".format(datetime.now() -
                                                       beginTime))
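A possible invocation of the helper above (the URL and folder are illustrative, not from the original):

downloadWebPage('https://example.com/', './downloads')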
Example #3
from pywebcopy import save_website

kwargs = {'project_name': 'website downloaded'}

# save into the current working directory
save_website(url='https://www.tutorialspoint.com', project_folder='.', **kwargs)
Example #4
import os
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup


def crawl(pages, depth=2):
    # Breadth-first link collector: gather links from each page, then
    # crawl the collected links on the next pass, `depth` times in total.
    # (The opening of this function was cut off in the snippet; the Python 2
    # urllib2 call is replaced with its Python 3 equivalent.)
    indexed_url = []
    for _ in range(depth):
        for page in pages:
            try:
                c = urlopen(page)
            except Exception:
                print("Could not open %s" % page)
                continue
            soup = BeautifulSoup(c.read(), 'html.parser')
            links = soup('a')  # finding all the sub_links
            for link in links:
                if 'href' in dict(link.attrs):
                    url = urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]  # drop fragment identifiers
                    if url[0:4] == 'http':
                        indexed_url.append(url)
        pages = indexed_url
    return indexed_url


pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
# print( urls )

import os

from pywebcopy import save_website

url = 'http://www.reirab.com/comp551.html'
download_folder = os.getcwd()

kwargs = {'bypass_robots': True, 'project_name': 'PyWebCopy'}

save_website(url, download_folder, **kwargs)
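The crawler and the pywebcopy call above are independent halves; a minimal sketch that wires them together, assuming each crawled URL should be saved as an individual page with save_webpage (the same function used in later examples):

from pywebcopy import save_webpage

for page_url in urls:
    # save every page discovered by the crawl into the same folder
    save_webpage(page_url, download_folder, bypass_robots=True)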
Example #5
from pywebcopy import save_website

kwargs = {'project_name': 'wdm'}

save_website(
    url='https://rednoise.org/teaching/wdm/',
    project_folder='./downloads',
    **kwargs
)
Example #6
#!pip install pywebcopy

from pywebcopy import save_website

print("Which website do you want to mirror?:\n")
_url = input()

print("And where do you want to save it? Enter the path to the folder:\n")
_project_folder = input()

save_website(
    url=_url,
    project_folder=_project_folder,
)
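
Unlike most of the other examples here, this one does not pass bypass_robots, so pywebcopy will honour the target site's robots.txt; if that gets in the way, the flag used elsewhere in these examples can be added (a variant, not part of the original):

save_website(
    url=_url,
    project_folder=_project_folder,
    bypass_robots=True,
)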

Example #7
# Import save_website from pywebcopy
from pywebcopy import save_website

# Set some download options
kwargs = {'project_name': 'some-fancy-name'}

# Point at the URL and the destination folder
save_website(
    url='https://hackertyper.net/',
    project_folder="path/to/downloads",
    **kwargs
)
Example #8
'''
Copy a full website and a single webpage, as a test.
'''

from pywebcopy import save_website, save_webpage

save_webpage(
    url='http://example.com/',
    project_folder='webpage/',
)

save_website(
    url='http://example.com/',
    project_folder='website/',
)
Example #9
"""Usando una libreria para clonar sitios web"""

import os
from pywebcopy import save_website

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(BASE_DIR, 'web_clonadas')

kwargs = {'project_name': 'usac'}

save_website(url='https://portal.ingenieria.usac.edu.gt/',
             project_folder=path,
             **kwargs)
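If the destination folder might not exist yet, it can be created before the save_website call; pywebcopy may well create it on its own, so this line (not in the original) is only a defensive addition:

os.makedirs(path, exist_ok=True)  # ensure the target folder exists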
Example #10
def pars(self) -> None:
    save_website(url=self.__url,
                 project_folder=self.__path,
                 zip_project_folder=False)
    create_zip(self.__path, self.__id)
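create_zip is not defined in this snippet; a minimal sketch of such a helper using only the standard library (the signature is taken from the call above, the archive naming is an assumption):

import shutil


def create_zip(folder, archive_id):
    # pack the mirrored project folder into '<archive_id>.zip'
    # in the current working directory
    shutil.make_archive(str(archive_id), 'zip', root_dir=folder)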
Example #11
from pywebcopy import save_website
import logging
import os

# configure basic logging output
logging.basicConfig()

# download website
website = input('URL for website to be copied: \n')
# set a directory to work in
directory = os.getcwd()

# set project name
name = input('What do you want to name the folder with the information?\n')

# save the website (comment this out on re-runs once it is already saved)
save_website(url=website, project_folder=directory, project_name=name)

# collect the scraped HTML files into a list
html_files = []
scraped_website_folder = f"{directory}/{name}"
for root, dirs, files in os.walk(scraped_website_folder):
    for file_name in files:
        if file_name.endswith('.html'):
            html_files.append(os.path.join(root, file_name))
Example #12
#!/usr/bin/env python3

from pywebcopy import save_website

kwargs = {'project_name': 'webmirrors'}
save_website(
    url='http://help.sumologic.com/',
    project_folder='/var/tmp/',
    **kwargs
)
Example #13
'''
Copying a whole website puts a real load on the
servers of the site and in rare cases could even be illegal, so check everything before
you proceed.


Choose a method and uncomment the one you like.
'''

# method 1:
'''
pywebcopy.config.setup_config(project_url='http://localhost:5000/', project_folder='e://tests/', project_name='LocalHost')
crawler = pywebcopy.Crawler('http://localhost:5000/')
crawler.crawl()
'''

# method 2:
'''
pywebcopy.save_website(page_url, download_folder)
'''
# pywebcopy.save_webpage(page_url, download_folder)
# pywebcopy.WebPage(page_url, html).save_html('e://tests//index.html')
# wp = pywebcopy.webpage.WebPage()
# wp.url = 'http://localhost:5000'
# wp.get('http://google.com/')
# wp.set_source(handle)
# pywebcopy.config.setup_config(wp.url, download_folder, 'LocalHost')
# wp.save_complete()
'''
import threading

# wait until every worker thread spawned by the crawl has finished
for thread in threading.enumerate():
    if thread == threading.main_thread():
        continue
    else:
        thread.join()
'''
Example #14
from pywebcopy import save_website
import globals  # local module; unused in this excerpt

save_website(
    url="http://www.stqc.gov.in",
    project_name="target_website",
    project_folder='./',
)
Example #15
from pywebcopy import save_website

dir_name = 'login'
site_url = 'http://brandio.io/envato/iofrm/html/'
kwargs = {'project_name': dir_name}
save_website(url=site_url, project_folder=dir_name, **kwargs)
Example #16
import os
import sys

from pywebcopy import save_webpage, save_website


def print_usage():
    # placeholder usage text; the original script's message is not shown
    print("usage: script.py (-p | -c) URL [-d DOWNLOAD_FOLDER]")


args = sys.argv[1:]

if not args:
    print_usage()
    sys.exit(1)

# The snippet begins mid-way; '-p' for single pages is an assumed flag name,
# only the '-c' (full website) branch appears explicitly below.
if args[0] == '-p':
    if len(args) < 2:
        print_usage()
        sys.exit(1)

    if len(args) == 2:
        print("Saving {!r} in {!r}".format(args[1], os.getcwd()))
        save_webpage(args[1], os.getcwd())

    elif len(args) == 4 and args[2] == '-d':
        print("Saving {!r} in {!r}".format(args[1], args[3]))
        save_webpage(args[1], args[3])

    else:
        print_usage()
        sys.exit(1)

elif args[0] == '-c':
    if len(args) < 2:
        print_usage()
        sys.exit(1)

    if len(args) == 2:
        print("Saving {!r} in {!r}".format(args[1], os.getcwd()))
        save_website(args[1], os.getcwd())

    elif len(args) == 4 and args[2] == '-d':
        print("Saving {!r} in {!r}".format(args[1], args[3]))
        save_website(args[1], args[3])

    else:
        print_usage()
        sys.exit(1)
Example #17
if args.location and not isinstance(args.location, six.string_types):
    parser.error("--location option requires 1 string type argument")
if args.name and not isinstance(args.name, six.string_types):
    parser.error("--name option requires 1 string type argument")

if args.page:
    save_webpage(
        url=args.url,
        project_folder=args.location,
        bypass_robots=args.bypass_robots,
        open_in_browser=args.pop,
        debug=not args.quite,
        delay=args.delay,
        threaded=args.threaded,
    )
elif args.site:
    save_website(
        url=args.url,
        project_folder=args.location,
        bypass_robots=args.bypass_robots,
        open_in_browser=args.pop,
        debug=not args.quite,
        delay=args.delay,
        threaded=args.threaded,
    )
elif args.tests:
    os.system('%s -m unittest discover -s pywebcopy/tests' % sys.executable)
else:
    parser.print_help()
    sys.exit(1)
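This excerpt presupposes an argparse parser built earlier in the file. A minimal sketch of a parser that would produce the attributes used above; only the attribute names come from the snippet, while every option string, type, and default here is an assumption (this is not pywebcopy's actual command-line interface):

import argparse

parser = argparse.ArgumentParser(prog='pywebcopy')
parser.add_argument('url', help='entry-point url to copy')
parser.add_argument('--location', help='folder to save the project into')
parser.add_argument('--name', help='name of the project')
parser.add_argument('--page', action='store_true', help='save a single webpage')
parser.add_argument('--site', action='store_true', help='save the whole website')
parser.add_argument('--bypass_robots', action='store_true', help='ignore robots.txt')
parser.add_argument('--pop', action='store_true', help='open the result in a browser')
parser.add_argument('--quite', action='store_true', help="suppress debug output ('quite' is the attribute the snippet uses)")
parser.add_argument('--delay', type=float, default=None, help='delay between requests')
parser.add_argument('--threaded', action='store_true', help='download using threads')
parser.add_argument('--tests', action='store_true', help='run the test suite')
args = parser.parse_args()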