Example #1
 @classmethod
 def cleanup_proxy_list(cls, proxy_list):
     lines = [line.strip() for line in proxy_list]
     return list({
         add_http_if_no_scheme(url)
         for url in lines
         if url and not url.startswith('#')
     })
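
Every snippet in this gallery calls add_http_if_no_scheme, the helper Scrapy ships as scrapy.utils.url.add_http_if_no_scheme. For reference, here is a minimal sketch of its behavior, consistent with the tests further down (an approximation, not the library source):

import re
from urllib.parse import urlparse

def add_http_if_no_scheme(url):
    """Prepend 'http' when the URL carries no scheme of its own."""
    # Any explicit scheme (http, https, ftp, ...) is preserved untouched.
    if not re.match(r'^\w+://', url, flags=re.IGNORECASE):
        # Protocol-relative URLs ('//host/...') already parse with a netloc,
        # so only 'http:' is prepended; bare hosts need the full 'http://'.
        parts = urlparse(url)
        url = ('http:' if parts.netloc else 'http://') + url
    return url
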
Example #2
    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            url = add_http_if_no_scheme(url)
        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader,
                                              Request(url),
                                              spidercls,
                                              log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the setup in the crawl method won't work
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url)
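
This run method matches the body of Scrapy's built-in shell command (scrapy/commands/shell.py): it executes when you type, for example, scrapy shell example.com, and add_http_if_no_scheme supplies the missing http:// before the Request is built.
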
Example #3
 def test_protocol_relative_complete_url(self):
     self.assertEqual(
         add_http_if_no_scheme(
             '//username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag'
         ),
         'http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag'
     )
Example #4
    def load_ninja(self, ninja_key, proxy_list, backoff=None):
        proxyList = []
        if ninja_key is not None:
            r = requests.get(url='https://scrapy.ninja/get_proxy.php?lic=%s' % ninja_key)
            for i in r.json()['proxies']:
                proxyList.append("http://%s/" % i)

        if proxy_list is not None:
            for i in proxy_list:
                proxyList.append("http://%s/" % i)

        lines = [line.strip() for line in proxyList]
        proxyList = list({
            add_http_if_no_scheme(url)
            for url in lines
            if url and not url.startswith('#')
        })

        self.proxies = {url: ProxyState() for url in proxyList}
        self.proxies_by_hostport = {
            extract_proxy_hostport(proxy): proxy
            for proxy in self.proxies
        }
        self.unchecked = set(self.proxies.keys())
        self.good = set()
        self.dead = set()

        if backoff is None:
            backoff = exp_backoff_full_jitter
        self.backoff = backoff
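
The middle block repeats the cleanup idiom from Example #1. A quick illustration of what that normalization does (hostname invented for the demo; the import assumes Scrapy's implementation of the helper):

from scrapy.utils.url import add_http_if_no_scheme

raw = [' proxy1.example:8080 ', 'http://proxy1.example:8080', '# a comment', '']
lines = [line.strip() for line in raw]
cleaned = list({
    add_http_if_no_scheme(url)
    for url in lines
    if url and not url.startswith('#')
})
print(cleaned)
# ['http://proxy1.example:8080'] -- blanks and comments dropped, duplicates merged by the set
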
Example #5
def read_urls(fp):
    """ Read a file with urls, one url per line. """
    for line in fp:
        url = line.strip()
        if not url:
            continue
        if url == 'url':
            continue  # optional header
        yield add_http_if_no_scheme(url)
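
A possible invocation of read_urls, with the file contents made up for illustration (assumes read_urls and add_http_if_no_scheme are in scope as above):

import io

sample = io.StringIO('url\nexample.com\nhttps://example.org\n\n')
print(list(read_urls(sample)))
# ['http://example.com', 'https://example.org'] -- optional header and blank line skipped
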
Example #6
 def _get_urls(self, fp):
     for row in fp:
         url = row.strip()
         if not url:
             continue
         if url == 'url':
             continue  # optional header
         url = add_http_if_no_scheme(url)
         yield url
Example #7
 def __init__(self, url, search_terms=None, *args, **kwargs):
     if url.startswith('.'):
         with open(url) as f:
             urls = [line.strip() for line in f]
     else:
         urls = [url]
     self.start_urls = [add_http_if_no_scheme(_url) for _url in urls]
     self.search_terms = search_terms
     self._extra_search_terms = None  # lazy-loaded via extra_search_terms
     self._reset_link_extractors()
     self.images_link_extractor = LinkExtractor(
         tags=['img'], attrs=['src'], deny_extensions=[])
     self.state = {}
     # Load headless horseman scripts
     self.lua_source = load_directive('headless_horseman.lua')
     self.js_source = load_directive('headless_horseman.js')
     super().__init__(*args, **kwargs)
Example #8
 def __init__(self, url, search_terms=None, *args, **kwargs):
     if url.startswith('.'):
         with open(url) as f:
             urls = [line.strip() for line in f]
     else:
         urls = [url]
     self.start_urls = [add_http_if_no_scheme(_url) for _url in urls]
     self.search_terms = search_terms
     self._extra_search_terms = None  # lazy-loaded via extra_search_terms
     self._reset_link_extractors()
     self.images_link_extractor = LinkExtractor(
         tags=['img'], attrs=['src'], deny_extensions=[])
     self._files_fingerprints = set()
     self.state = {}
     self.use_splash = None  # set up in start_requests
     # Load headless horseman scripts
     self.lua_source = load_directive('headless_horseman.lua')
     self.js_source = load_directive('headless_horseman.js')
     super().__init__(*args, **kwargs)
Example #9
 def __init__(self, url, search_terms=None, *args, **kwargs):
     if url.startswith('.') or url.startswith('/'):
         with Path(url).open('rt', encoding='utf8') as f:
             urls = [line.strip() for line in f]
     else:
         urls = [u for u in url.split() if u]
     self.start_urls = [add_http_if_no_scheme(_url) for _url in urls]
     self.search_terms = search_terms
     self._extra_search_terms = None  # lazy-loaded via extra_search_terms
     self._reset_link_extractors()
     self.images_link_extractor = LinkExtractor(
         tags=['img'], attrs=['src'], deny_extensions=[],
         canonicalize=False)
     self.state = {}
     self.use_splash = None  # set up in start_requests
     self._screenshot_dest = None  # type: Path
     # Load headless horseman scripts
     self.lua_source = load_directive('headless_horseman.lua')
     self.js_source = load_directive('headless_horseman.js')
     super().__init__(*args, **kwargs)
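
The three __init__ variants above share a convention for the url argument; a sketch of how the Example #9 version resolves it (values invented):

from scrapy.utils.url import add_http_if_no_scheme

# './urls.txt' or '/abs/path.txt' -> read one URL per line from that file;
# anything else -> treat the argument as whitespace-separated URLs.
url = 'example.com example.org'
urls = [u for u in url.split() if u]
print([add_http_if_no_scheme(u) for u in urls])
# ['http://example.com', 'http://example.org']
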
Example #10
    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            url = add_http_if_no_scheme(url)
        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader, Request(url),
                                              spidercls, log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the setup in the crawl method won't work
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url)
Example #11
 def test_preserve_http_path(self):
     self.assertEqual(add_http_if_no_scheme('http://www.example.com/some/page.html'),
                                            'http://www.example.com/some/page.html')
Example #12
 def test_preserve_http_without_subdomain(self):
     self.assertEqual(add_http_if_no_scheme('http://example.com'),
                                            'http://example.com')
Example #13
 def test_add_scheme(self):
     self.assertEqual(add_http_if_no_scheme('www.example.com'),
                                            'http://www.example.com')
Example #14
 def test_without_subdomain(self):
     self.assertEqual(add_http_if_no_scheme("example.com"), "http://example.com")
Example #15
 def test_protocol_relative_query(self):
     self.assertEqual(add_http_if_no_scheme('//www.example.com/do?a=1&b=2&c=3'),
                                            'http://www.example.com/do?a=1&b=2&c=3')
Example #16
 def test_protocol_relative_port(self):
     self.assertEqual(add_http_if_no_scheme('//www.example.com:80'),
                                            'http://www.example.com:80')
Example #17
 def test_protocol_relative_without_subdomain(self):
     self.assertEqual(add_http_if_no_scheme('//example.com'),
                                            'http://example.com')
Example #18
 def test_preserve_http_query(self):
     self.assertEqual(add_http_if_no_scheme('http://www.example.com/do?a=1&b=2&c=3'),
                                            'http://www.example.com/do?a=1&b=2&c=3')
Example #19
 def test_preserve_http_port(self):
     self.assertEqual(add_http_if_no_scheme('http://www.example.com:80'),
                                            'http://www.example.com:80')
Example #20
 def test_path(self):
     self.assertEqual(
         add_http_if_no_scheme("www.example.com/some/page.html"), "http://www.example.com/some/page.html"
     )
Example #21
 def test_protocol_relative(self):
     self.assertEqual(add_http_if_no_scheme("//www.example.com"), "http://www.example.com")
Example #22
 def test_query(self):
     self.assertEqual(
         add_http_if_no_scheme("www.example.com/do?a=1&b=2&c=3"), "http://www.example.com/do?a=1&b=2&c=3"
     )
Example #23
 def test_fragment(self):
     self.assertEqual(
         add_http_if_no_scheme("www.example.com/some/page#frag"), "http://www.example.com/some/page#frag"
     )
Example #24
 def test_port(self):
     self.assertEqual(add_http_if_no_scheme("www.example.com:80"), "http://www.example.com:80")
Example #25
 def test_preserve_http_fragment(self):
     self.assertEqual(add_http_if_no_scheme('http://www.example.com/some/page#frag'),
                                            'http://www.example.com/some/page#frag')
Example #26
    def __init__(self,
                 url,
                 page_config_file,
                 search_terms=None,
                 *args,
                 **kwargs):
        if url.startswith('.') or url.startswith('/'):
            with Path(url).open('rt', encoding='utf8') as f:
                urls = [line.strip() for line in f]
        else:
            urls = [u for u in url.split() if u]
        print('search terms - ' + str(search_terms))
        self.start_urls = [add_http_if_no_scheme(_url) for _url in urls]
        self.search_terms = search_terms
        self._extra_search_terms = None  # lazy-loaded via extra_search_terms
        self._reset_link_extractors()
        self.images_link_extractor = LinkExtractor(tags=['img'],
                                                   attrs=['src'],
                                                   deny_extensions=[],
                                                   canonicalize=False)
        self.state = {}
        self.use_splash = None  # set up in start_requests
        self._screenshot_dest = None  # type: Path
        # Load headless horseman scripts
        self.lua_source = load_directive('headless_horseman.lua')
        self.js_source = load_directive('headless_horseman.js')

        self.forms_info = []
        forms_info_str = []

        #        print('file path --- ' + forms_input) - added to argument list

        #        with Path('/Users/neha/projects/openwatch/forms-info.txt').open('r', encoding='utf8') as f:
        #          for line in f:
        #            line = line.rstrip('\n')
        #            print('LINE -- ' + line)
        #            forms_info_str.append(line)

        #'/users/neha/projects/openwatch/page-config.json'
        if page_config_file:
            with Path(page_config_file).open('r', encoding='utf8') as f:
                self.pages_data = json.load(f)
                self.pages_cfg = self.pages_data['pagesInfo']
                print(self.pages_cfg)
        else:
            self.pages_cfg = []
            print('no page_config_file specified')

#        num_forms = len(forms_info_str)//4
#        if num_forms > 0:
#          for x in range(num_forms):
#            print('form idx - ' + str(x))
#            line_idx = x*4
#            form_url = forms_info_str[line_idx]
#            form_param = json.loads(forms_info_str[line_idx + 1])
#            form_data = json.loads(forms_info_str[line_idx + 2])
#            form_method = forms_info_str[line_idx + 3]
#            form = (form_url, form_param, form_data, form_method)
#            self.forms_info.append(form)
#          print('WHOLE LIST -- ' + json.dumps(self.forms_info))

        super().__init__(*args, **kwargs)
Example #27
 def test_preserve_http_username_password(self):
     self.assertEqual(add_http_if_no_scheme('http://username:password@www.example.com'),
                                            'http://username:password@www.example.com')
Example #28
 def test_preserve_http_complete_url(self):
     self.assertEqual(
         add_http_if_no_scheme("http://*****:*****@www.example.com:80/some/page/do?a=1&b=2&c=3#frag"),
         "http://*****:*****@www.example.com:80/some/page/do?a=1&b=2&c=3#frag",
     )
Example #29
 def test_protocol_relative_path(self):
     self.assertEqual(add_http_if_no_scheme('//www.example.com/some/page.html'),
                                            'http://www.example.com/some/page.html')
Example #30
 def test_protocol_relative_complete_url(self):
     self.assertEqual(add_http_if_no_scheme('//username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag'),
                                            'http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag')
Example #31
 def test_protocol_relative_fragment(self):
     self.assertEqual(add_http_if_no_scheme('//www.example.com/some/page#frag'),
                                            'http://www.example.com/some/page#frag')
Example #32
 def test_username_password(self):
     self.assertEqual(
         add_http_if_no_scheme("username:[email protected]"), "http://*****:*****@www.example.com"
     )
Example #33
 def test_protocol_relative_username_password(self):
     self.assertEqual(add_http_if_no_scheme('//username:password@www.example.com'),
                                            'http://username:password@www.example.com')
Example #34
 def test_preserve_https(self):
     self.assertEqual(add_http_if_no_scheme("https://www.example.com"), "https://www.example.com")
Example #35
 def test_preserve_ftp(self):
     self.assertEqual(add_http_if_no_scheme('ftp://www.example.com'),
                                            'ftp://www.example.com')
Example #36
 def rss(v):
     return add_http_if_no_scheme(unquote(v))
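
And a possible call of rss, with a percent-encoded input invented for the demo (assumes unquote comes from urllib.parse and add_http_if_no_scheme is in scope):

print(rss('www.example.com%2Ffeed.xml'))
# unquote -> 'www.example.com/feed.xml', then the scheme is added:
# 'http://www.example.com/feed.xml'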