Example #1
    def gather_links(page_url):
        html_string = ''
        if 'com/photos' in page_url:
            try:
                datalist = []
                urllist = []

                response = urlopen(page_url)
                if 'text/html' in response.getheader('Content-Type'):
                    html_bytes = response.read()
                    html_string = html_bytes.decode('utf-8')

                    content = html_string.split()

                    for i in content:
                        if i.startswith('src="'):
                            datalist.append(i)

                print("\n\n")
                print(datalist)
                print("\n\n")

                for num in datalist:
                    if num not in urllist:
                        urllist.append(num)

                print("\n\n")
                print(urllist)
                print("\n\n")

                for i in urllist:
                    if 'images.unsplash.com' in i:
                        download(i)

                finder = LinkFinder(Spider.base_url, page_url)
                finder.feed(html_string)

            except Exception as e:
                print(str(e))
                return set()
        else:
            try:
                response = urlopen(page_url)
                if 'text/html' in response.getheader('Content-Type'):
                    html_bytes = response.read()
                    html_string = html_bytes.decode('utf-8')
                finder = LinkFinder(Spider.base_url, page_url)
                finder.feed(html_string)
            except Exception as e:
                print(str(e))
                return set()
        return finder.page_links()
Example #2
 def gather_links(page_url):
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type').split(
                 '/')[0] in Spider.bannedResponses:
             raise Exception("Invalid Response")
         if response.getheader('Content-Type').split(
                 '/')[1] in Spider.bannedResponses:
             raise Exception("Invalid Response")
         if response.getheader('Content-Type').split(';')[0] == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
             soup = BeautifulSoup(html_string, 'html.parser')
             title = soup.find('title')
             title = title.text
             finder = LinkFinder(Spider.base_url, page_url)
             finder.handle_starttag(html_string)
             aTextItem = textItem(parse.urljoin(Spider.base_url, page_url),
                                  finder.return_url_text(html_string))
             Spider.textDict.update({Spider.dictCount: aTextItem})
             Spider.titleDict.update({page_url: title})
             Spider.dictCount += 1
             return finder.page_links()
     except Exception as ex:
         print(ex)
         print('Error: cannot crawl page ' +
               parse.urljoin(Spider.base_url, page_url))
         if page_url not in Spider.crawled:
             Spider.countOfNotCrawledPgages += 1
         return set()
Example #3
 def gather_links(page_url):
     # Create a variable/object to store HTML request's response
     html_string = ''
     # Enclose in a try-except block to handle exceptions during connections
     try:
         # Get the response after trying to connect to a webpage
         response = urlopen(page_url)
         # Check if response contains, text/html as Content-Type in header
         if 'text/html' in response.getheader('Content-Type'):
             # Read the response byte wise
             html_bytes = response.read()
             # Decode the response from byte order to human readable format
             # And store in variable/object created earlier to store response
             html_string = html_bytes.decode('utf-8')
         # Create a LinkFinder() object to start parsing webpages
         finder = LinkFinder(Spider.base_url, page_url)
         # Start parsing webpages using HTMLParser class's feed function
         finder.feed(html_string)
     # Catch exception
     except Exception as e:
         # Print exception info to console
         print(str(e))
         # Since an exception occurred, return an empty set() object
         return set()
     # If all operations are successful, return results
     return finder.page_links()
Example #4
    def gather_links(page_url):
        html_string = ""
        try:
            response = urlopen(page_url)

            if "text/html" in response.getheader("content-Type"):
                zipped_html_bytes = response.read()
                if Spider.html_gzipped:
                    try:
                        html_bytes = gzip.decompress(zipped_html_bytes)
                    except IOError:
                        Spider.html_gzipped = False
                        html_bytes = zipped_html_bytes
                else:
                    html_bytes = zipped_html_bytes
                try:
                    html_string = html_bytes.decode("utf-8")
                except UnicodeDecodeError:
                    try:
                        html_string = html_bytes.decode("gbk")
                    except Exception as e:
                        print(e)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(e)
            print("Error: can not craw page.")
            return set()
        response.close()
        return finder.page_links()
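Example #4 decides whether to gunzip the body with a Spider.html_gzipped flag and an IOError fallback. A minimal alternative sketch, assuming the server announces compression through the standard Content-Encoding response header (urlopen, getheader and gzip.decompress are standard-library calls; the helper name read_html is made up for illustration):

    import gzip
    from urllib.request import urlopen

    def read_html(page_url):
        # Fetch the page and transparently handle a gzip-compressed body.
        response = urlopen(page_url)
        raw = response.read()
        # Decompress only when the server explicitly declares gzip.
        if response.getheader('Content-Encoding') == 'gzip':
            raw = gzip.decompress(raw)
        response.close()
        return raw.decode('utf-8')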
Example #5
    def gather_links(page_url):
        """
        connects to site
        takes the html converts from html bytes to proper readable string
        passes to LinkFinder, LinkFinder parses throught and get all the links of the url.
        if theres no error then return., else it will return an empty set with the message
        "error: cannot crawl page!"
        """
        html_string = ''
        #using error catching on networking
        try:
            response = urlopen(page_url)

            # make sure it's an HTML page and not some other format such as a PDF
            if 'text/html' in response.getheader('Content-Type'):
                #python read in html bytes format
                html_bytes = response.read()
                #convert into human readable character (utf-8)
                html_string = html_bytes.decode('utf-8')
            # create a LinkFinder object
            finder = LinkFinder(Spider.base_url, page_url)
            #feed in the html strings
            finder.feed(html_string)
        except:
            print('Error: cannot crawl page!')
            return set()
        return finder.page_links()
Example #6
 def gather_links(page_url):
     html_string = Spider.connect(page_url)
     if html_string is None:
         return set()                                        # if there is an error return an empty set
     finder = LinkFinder(Spider.base_url, page_url)
     finder.feed(html_string)                                # pass in html data
     return finder.page_links()                              # if there is no error return the set of page links
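Example #6 hides the network call behind a Spider.connect helper that is not shown here. A plausible sketch of such a helper, assuming it follows the same urlopen pattern as the other examples and returns None on any failure (which is how gather_links above uses it); the body is an assumption, not the project's actual code:

    from urllib.request import urlopen    # module-level import

    @staticmethod
    def connect(page_url):
        # Hypothetical helper: return the decoded HTML, or None if anything goes wrong.
        try:
            response = urlopen(page_url)
            if 'text/html' not in response.getheader('Content-Type'):
                return None
            html_string = response.read().decode('utf-8')
            response.close()
            return html_string
        except Exception as e:
            print(str(e))
            return None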
Example #7
 def gather_links(page_url):
     try:
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(CustomConnection.URL(page_url))
     except:
         return set()
     return finder.page_links()
Example #8
 def gather_links(page_url):
     html_string = ''
     try:
         response = requests.get(page_url)
         html_string = response.content
         soup = BeautifulSoup(html_string, 'html.parser')
         finder = LinkFinder(soup)
Example #9
    def gather_links_and_text(page_url):
        if page_url in Spider.crawled:
            Spider.queue.remove(page_url)
            print("***************************** Duplicate found!!!!!!!!!!!!!!!!")
            return set()

        else:
            html_string = ''
            try:
                article = Article(page_url, language='bn')
                article.download()
                article.parse()

                html_string = article.html
                Spider.news += article.title + '\n' + article.text

                Spider.page_count += 1
                file = codecs.open(Spider.html_pages + randomString(8) + '.html', "a", "utf-8")
                file.write(html_string)
                file.close()

                if Spider.page_count % 100 == 0:
                    with codecs.open(Spider.project_name + '/all_texts.txt', "a", "utf-8") as w:
                        for l in Spider.news:
                            w.write(l)
                    Spider.news = ""

                # find the links
                finder = LinkFinder(Spider.base_url, page_url)
                finder.feed(html_string)

            except Exception as e:
                print(str(e))
                return set()
            return finder.page_links()
Example #10
 def crawl_page(thread_name, page_url):
     if (page_url not in (Spider.crawled | Spider.finish)):
         finder = LinkFinder(Spider.base_url, page_url)
         Spider.add_links_to_queue_finish(finder.links())
         Spider.queue.remove(page_url)
         Spider.crawled.add(page_url)
         Spider.update_files()
Example #11
    def gather_links(page_url):
        html_string = ''
        try:
            request = urllib2.Request(page_url)
            response = urllib2.urlopen(request)
            # response = urllib2.urlopen(page_url)
            u = response.info().getheader('Content-Type')
            print u
            # print Spider.Type

            if u.find(Spider.Type) != -1:
                vv = page_url
                if vv not in Spider.downloaded:
                    download_file('./' + Spider.projectname, vv)
                    Spider.downloaded.add(vv)

            if u.startswith('text/html'):
                if Spider.Type == "image":
                    Spider.get_images(page_url)
                html_bytes = response.read()
                html_string = html_bytes.decode('utf-8')

                # print "here"

            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception, err:
            # print('Error: cannot crawl page')
            print Exception, err
            return set()

        return finder.page_links()
Example #12
	def gather_links(page_url):
		html_string = ''
		returnlinks = set()
		

		try:	
			user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
			headers = {'Connection' : 'keep-alive', 'User-Agent': user_agent,}
			request=Request(page_url,None,headers) #The assembled request
			response = urlopen(request)
			returnheader = response.getheader('Content-Type')	
			html_bytes = response.read()

			if 'text/html' in returnheader:
				html_string = html_bytes.decode("utf-8")
				finder = LinkFinder(Spider.base_url, page_url)
				finder.feed(html_string)
				foundlinks = finder.page_links()
				#returnlinks = foundlinks
				returnlinks = Spider.cull(foundlinks, page_url, response) 
			
			response.close()
		
		except URLError:
			print('error encountered, most likely a 404\n')
			return set()			
		return returnlinks 
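Example #12 passes the discovered links through a Spider.cull helper that is not shown. A minimal sketch of what such a filter might look like, assuming it keeps only same-domain links and drops obvious binary resources; the name and the (links, page_url, response) signature come from the call above, the body is an assumption:

    @staticmethod
    def cull(found_links, page_url, response):
        # Hypothetical filter: keep only links on the crawl's own domain that
        # look like HTML pages. The response argument is accepted to allow
        # header-based checks but is unused in this sketch.
        from urllib.parse import urlparse
        base_domain = urlparse(Spider.base_url).netloc
        kept = set()
        for link in found_links:
            parsed = urlparse(link)
            if parsed.netloc != base_domain:
                continue  # skip external domains
            if parsed.path.lower().endswith(('.pdf', '.jpg', '.png', '.zip')):
                continue  # skip obvious non-HTML resources
            kept.add(link)
        return kept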
Example #13
	def boot():
		create_project_dir(Spider.project_name)
		create_data_files(Spider.project_name, Spider.base_url)
		Spider.queue = file_to_set(Spider.queue_file)
		Spider.crawled = file_to_set(Spider.crawled_file)

	# Updates user display, fills queue and updates files
	@staticmethod
	def crawl_page(thread_name, page_url):
		if page_url not in Spider.crawled:
			print(thread_name + ' now crawling ' + page_url)
			print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
			Spider.add_links_to_queue(Spider.gather_links(page_url))
			Spider.queue.remove(page_url)
			Spider.crawled.add(page_url)
			Spider.update_files()

	# Converts raw response data into readable information and checks for proper html formatting
	@staticmethod
	def gather_links(page_url):
		html_string = ''
		try:
			response = urlopen(page_url)
			if 'text/html' in response.getheader('Content-Type'):
				html_bytes = response.read()
				html_string = html_bytes.decode("utf-8")
			finder = LinkFinder(Spider.base_url, page_url)
			finder.feed(html_string)
		except Exception as e:
			print(str(e))
			return set()
		return finder.page_links()
Example #14
def ElementFinder(url):
    logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
    logging.info('Running element finder')
    driver = webdriver.Chrome(executable_path='chromedriver.exe')
    links_to_be_checked = LinkFinder(url)
    test_pages = []
    #for link in links_to_be_checked[3:4]:
    for link in links_to_be_checked:
        logging.info('Checking URL ' + link)
        driver.get(link)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        page_elems_list = []
        for id, a in enumerate(soup.find_all()):
            elems = a.attrs
            elems['element_id'] = id
            elems['element_name'] = a.name
            elems['element_text'] = a.text
            elems['page_link'] = link
            elems['element_points_title'] = 0
            elems['element_points_content'] = 0
            elems['element_index'] = 0
            page_elems_list.append(elems)

        test_pages.append(page_elems_list)

    return test_pages
Example #15
	def gather_links(page_url):
		html_string = ''
		try:
			# Some websites dislike being browsed by programs; pretending to be
			# a normal browser user gets around that problem
			user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
			values = {'name' : 'Rosa Foord',
			          'location' : 'Lyon',
			          'language' : 'Python' }
			headers = { 'User-Agent' : user_agent }

			data  = urllib.parse.urlencode(values)
			data = data.encode('utf-8')
			req = urllib.request.Request(page_url, data, headers)
			response = urllib.request.urlopen(req)
			if 'text/html' in response.getheader('Content-Type'):
				html_bytes = response.read()
				html_string = html_bytes.decode('utf-8')
			finder = LinkFinder(Spider.base_url, page_url)
			finder.feed(html_string)
			#dataRetriever = DataRetriever(Spider.base_url, page_url)
			#dataRetriever.feed(html_string)
		except Exception as e:
			print(str(e))
			return set()
		return finder.page_links()
Example #16
    def gather_links(thread_name, page_url):
        data = {}
        try:
            user_agent_list = [
                # Chrome
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
                'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
                # Internet Explorer
                'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
                'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
                'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
                'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
                'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
            ]
            headers = {'User-Agent': random.choice(user_agent_list)}

            response = requests.get(page_url, headers=headers)

            if 'text/html' in response.headers['Content-Type']:
                soup = BeautifulSoup(response.text, 'lxml')
                title = soup.find('title').text
                keyword = soup.find('meta', attrs={'name':
                                                   'keywords'})['content']
                description = soup.find('meta',
                                        attrs={'name':
                                               'description'})['content']
                data = {
                    'title': title,
                    'meta_keywords': keyword,
                    'meta_description': description,
                    'page_url': page_url
                }
                print("Data fetched from {} : \n".format(thread_name), data)
                finder = LinkFinder(Spider.base_url, page_url)
                finder.feed(response.text)
                Spider.send_data_to_es(thread_name, data)
                del data
            else:
                return set()
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()
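Example #16 forwards the scraped metadata to Elasticsearch through a Spider.send_data_to_es helper that is not shown. A minimal sketch of such a helper, assuming the official elasticsearch Python client (7.x-style index()/body= call) and a node at http://localhost:9200; the index name crawled_pages and the helper's body are assumptions:

    from elasticsearch import Elasticsearch

    es = Elasticsearch(['http://localhost:9200'])   # assumed local node

    def send_data_to_es(thread_name, data):
        # Index one crawled-page document; in Example #16 this would live on
        # Spider as a @staticmethod.
        es.index(index='crawled_pages', body=data)
        print('{} indexed {}'.format(thread_name, data['page_url']))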
Example #17
 def gather_links(html_string, page_url):
     try:
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Example #18
 def gather_links(page_url):
     try:
         response = urlopen(page_url)
         html_bytes = response.read()
         html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print('Error')
         return set()
     return finder.page_links()
Example #19
 def __init__(self, crawl_queue: Queue, seen_urls: set, processed_urls: set, rank_queue: Queue
              , depth_limit, domain_name, crawl_queue_time_out: int, logger):
     self.logger = logger
     self.crawl_queue = crawl_queue
     self.seen_urls = seen_urls
     self.processed_urls = processed_urls
     self.rank_queue = rank_queue
     self.depth_limit = depth_limit
     self.domain_name = domain_name
     self.link_finder = LinkFinder()
     self.crawl_queue_time_out = crawl_queue_time_out
Example #20
File: spider.py Project: LeonDuan/Crawler
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print('Error: cannot crawl page ' + page_url)
         return set()
     return finder.page_links()
Example #21
File: Spider.py Project: harry363/Crawler
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         return set()
     return finder.page_links()
Example #22
 def gather_links(page_url):
     html_string = ''
     try:
         response = requests.get(page_url)
         if 'text/html' in response.headers['Content-Type']:
             html_string = response.text
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Example #23
File: spider.py Project: suman8843/crawler
 def gather_links(thread_name, page_url):
     try:
         response = urlopen(page_url)
         html_bytes = response.read()
         html_decompressed = zlib.decompress(html_bytes,
                                             16 + zlib.MAX_WBITS)
         html_string = html_decompressed.decode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         try:
             response = urlopen(page_url)
             html_bytes = response.read()
             html_string = html_bytes.decode('utf-8')
             #print(html_string)
             finder = LinkFinder(Spider.base_url, page_url)
             finder.feed(html_string)
         except:
             print(thread_name + ' cannot crawl ' + page_url)
             return set()
     return finder.page_links()
Example #24
	def gatherLinks(pageUrl):
		htmlText = ''
		returnlinks = set()
		try:	
			request=Request(pageUrl,None,Pagerunner.headers) #The assembled request
			response = urlopen(request)
			returnheader = response.getheader('Content-Type')	
			htmlBytes = response.read()
			Pagerunner.addResponse((pageUrl, response))

			if 'text/html' in returnheader:
			
				htmlText = htmlBytes.decode('utf-8')
				finder = LinkFinder(Pagerunner.startAddress, pageUrl)
				finder.feed(htmlText)
				foundlinks = finder.page_links() 
				returnlinks = foundlinks
				#print(returnlinks)
			

			response.close()

			Pagerunner.visited.add(pageUrl)
		
		except URLError as e:
			print(str(e) + ' : ' +  pageUrl)
			returnlinks =  set()

		except UnicodeDecodeError as e:
			print(str(e) + ' : ' +  pageUrl)
			returnlinks =  set()

		except UnicodeEncodeError as e:
			print(str(e) + ' : ' +  pageUrl)			
			returnlinks =  set()

		except ConnectionResetError as e:
			print(str(e) + ' : ' +  pageUrl)
			returnlinks =  set()

		except IncompleteRead as e:
			print(str(e) + ' : ' +  pageUrl)
			returnlinks =  set()

		finally:
			Pagerunner.visited.add(pageUrl)

		return returnlinks 
Example #25
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print('Error: can not crawl page')
         return set()  # something must be returned, so just return an empty set
     return finder.page_links()
Example #26
File: spider.py Project: alikoptan/spidey
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print('ERROR: CAN NOT CRAWL, WEBSITE COULD BE UNREACHABLE')
         return set()
     return finder.page_links()
Example #27
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.info().gettype() == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except IOError, e:
         print('Error: cannot crawl page')
         print(e)
         return list()

     return finder.page_links()
Example #28
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)  #Fetch the page
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode('utf-8')
             finder = LinkFinder(Spider.base_url, page_url)
             finder.feed(html_string)
             return finder.page_links()
     except:
         print('Error: Cannot crawl page')
         return set()
Example #29
 def gather_links(page_url):
     html_string = ""
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode('utf-8')
         finder = LinkFinder(Spider.base_url,page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Example #30
 def gather_links(page_url):
     html_string = ""
     try:
         response = urlopen(page_url)
         if response.getheader("Content-Type") == "text/html":
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print("Error: unable to crawl page")
         return set()
     return finder.page_links()
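Across the examples above, the recurring weak spots are the strict Content-Type equality check (which misses 'text/html; charset=utf-8'), decoding that assumes the page is valid UTF-8, and responses that are never closed. A consolidated sketch combining the fixes already shown in individual examples, under the same assumptions they all make (a LinkFinder class and a Spider.base_url attribute):

    from urllib.request import urlopen

    def gather_links(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            # 'in' rather than '==' so 'text/html; charset=utf-8' still matches
            if 'text/html' in response.getheader('Content-Type'):
                html_bytes = response.read()
                # tolerate pages that are not valid UTF-8 instead of raising
                html_string = html_bytes.decode('utf-8', errors='replace')
            response.close()
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()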