def gather_links(page_url):
    html_string = ''
    try:
        print("urlopen(" + page_url + Spider.suffix + ")")
        response = urlopen(page_url + Spider.suffix)
        # if response.getheader('Content-Type') == 'text/html':
        html_bytes = response.read()
        html_string = html_bytes.decode("utf-8")
        print('page_url = ' + page_url)
        urlElems = page_url.split('/')
        fileName = Spider.project_name + '/' + urlElems[-1] + '.html'
        print("save to " + fileName)
        with open(fileName, 'w') as f:
            f.write(html_string)
        # else:
        #     print('Failed to get Content-Type')
        finder = LinkFinder(Spider.base_url, page_url, Spider.ahref_class)
        finder.feed(html_string)
        converter = HTMLToTXTConverter()
        converter.feed(html_string)
        fileName = Spider.project_name + '/' + urlElems[-1] + '.txt'
        print("save to " + fileName)
        with open(fileName, 'w') as f:
            f.write(converter.getText())
    except:
        e = sys.exc_info()[0]
        print(e)
        print('Error: can not crawl page')
        return set()
    return finder.page_links()

def gather_links(page_url): html_string = "" try: response = urlopen(page_url) if "text/html" in response.getheader("content-Type"): zipped_html_bytes = response.read() if Spider.html_gzipped: try: html_bytes = gzip.decompress(zipped_html_bytes) except IOError: Spider.html_gzipped = False html_bytes = zipped_html_bytes else: html_bytes = zipped_html_bytes try: html_string = html_bytes.decode("utf-8") except UnicodeDecodeError: try: html_string = html_bytes.decode("gbk") except Exception as e: print(e) finder = LinkFinder(Spider.base_url, page_url) finder.feed(html_string) except Exception as e: print(e) print("Error: can not craw page.") return set() response.close() return finder.page_links()
def gather_links(page_url):
    try:
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(CustomConnection.URL(page_url))
    except:
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_string = response.read().decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print('Error: can not crawl page |', e)
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print("Error : Can't crawl page")
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ""
    try:
        response = urlopen(page_url)
        if response.getheader("Content-Type") == "text/html":
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print("Error: cannot crawl page")
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('Content-type') == 'text/html; charset=utf-8':
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: can not crawl page')
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = requests.get(page_url)
        if 'text/html' in response.headers['Content-Type']:
            html_string = response.text  # decoded text, not str() of the raw bytes
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(e)
        print('Error: can not crawl page')
        return set()
    return finder.page_links()

def gather_links(url):
    html_string = ''
    try:
        response = urlopen(url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, url)
        finder.feed(html_string)
    except Exception:
        print('Error: can not crawl page: ' + url)
        return set()
    return finder.page_links()

def gather_links(page_url): html_string = "" try: response = urlopen(page_url) if "text/html" in response.getheader("Content-Type"): html_bytes = response.read() html_string = html_bytes.decode("utf-8") finder = LinkFinder(Spider.base_url, page_url) finder.feed(html_string) except Exception as e: print("Cannot crawl page, Exception: "+ str(e)) return set() return finder.page_links()
def fetch_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
        return finder.page_links()
    except Exception as e:
        print("Can not crawl page \n")
        print(str(e))
        return set()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            chardit1 = chardet.detect(html_bytes)
            html_string = html_bytes.decode(chardit1['encoding'])
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        # if response.getheader('Content-Type') == 'text/html':
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: Cannot Crawl Page')
        return set()
    return finder.page_links()

def gather_link(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: Can\'t crawl page')
        return set()
    return finder.page_links()

def get_menu_links(self, page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder()
        finder.feed(html_string)
        menu_links = finder.get_links()
        return menu_links
    except Exception as error:
        print(str(error))
        return ''

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print("Error: can not crawl page")
        return set()  # returning empty set so we can continue
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('content-type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('error: can not crawl page')
        return set()
    return finder.page_links()

def gatherLinks(pageUrl):
    '''Connect to site, take html and convert to string format.
    Pass data to LinkFinder for parsing.'''
    htmlString = ''
    try:
        response = urlopen(pageUrl)
        if response.getheader('Content-Type') == 'text/html':
            htmlBytes = response.read()
            htmlString = htmlBytes.decode('utf-8')
        finder = LinkFinder(Spider.baseUrl, pageUrl)
        finder.feed(htmlString)
    except:
        print('Error: cannot crawl page!!!')
        return set()
    return finder.pageLinks()

def gather_links(page_url):
    html_str = ''
    try:
        request = Request(page_url, headers=Spider.headers)
        response = urlopen(request)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_str = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_str)
    except:
        print('Cannot access ' + page_url)
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        # check that the file encountered is HTML and not a PDF, executable, or any other file
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        print('Error: can not crawl page')
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        err_str = "ERROR: {} -- {}".format(page_url, str(e))
        print(err_str)
        return set(), err_str
    return finder.page_links(), ""

def gather_link(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('content-type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print("error")
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
        response = urlopen(req)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        # print(type(response.getheader('Content-Type')))
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print("Error : can not gather links from " + page_url)
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urllib2.urlopen(page_url)
        if response.info().getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: cannot crawl page. ')
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        header = response.getheader('Content-Type')
        if 'text/html' in header:
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    req = Request(url=page_url, headers=Spider.headers)
    try:
        response = urlopen(req)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: can not crawl page: ' + page_url)
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    # any time we do networking we wrap the call in try/except
    try:
        response = urlopen(page_url)  # connects to the web page
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print("Error !! Cannot crawl page")
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('Content-Type') == 'text/html':
            # storing binary response
            html_bytes = response.read()
            # converting binary to string
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: cannot crawl page')
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.info().getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url)
        finder.feed(html_string)
        # return the set of crawled URLs
        return finder.get_links()
    except:
        print('Error: can not crawl page.')
        return set()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        # substring check so Content-Type values that carry extra data (e.g. a charset) still match
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: can not crawl page')
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        # whenever you make a network connection it's good to wrap it in a try block
        response = urlopen(page_url)  # this gives us the byte data
        if 'text/html' in response.getheader('Content-Type'):  # just checking it's not a pdf or something else
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        opener = urllib.request.build_opener(Spider.proxy_support)
        urllib.request.install_opener(opener)
        with urllib.request.urlopen(page_url) as response:
            if 'text/html' in response.getheader('Content-Type'):
                html_bytes = response.read()
                html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        gcontext = ssl._create_unverified_context()
        response = urlopen(page_url, context=gcontext)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print('Error: cannot crawl page')
        print(e)
        return set()
    return finder.page_links()

def gather_links(page_url):
    # collects all the links found on the given URL and returns them to crawl_page()
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def generateLinks(page_url):
    gathered_links = set()
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(page_url, page_url)
        finder.feed(html_string)
        gathered_links = finder.page_links()
    except Exception as e:
        print(str(e))
    return gathered_links

def gather_links(page_url):
    html_string = ''  # holds the crawled page for parsing
    try:
        response = urlopen(page_url)
        # check the content type is text/html; do not open pdf etc. files
        if 'text/html' in response.headers.getheader('Content-Type'):
            html_bytes = response.read()  # output is in html bytes
            html_string = html_bytes.decode('utf-8', 'replace')  # utf-8 is the encoding format
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        # make sure it is an actual web page, not an executable etc.
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)  # pass in the html data for parsing
    except Exception as e:
        print(str(e))
        return set()  # empty set
    return finder.page_links(), Spider.get_loc(page_url)

def gather_link(page_url):
    # get the html
    soup2 = ""
    try:
        page = requests.get(page_url, headers=HEADERS)
        soup = BeautifulSoup(page.content, "lxml")
        soup2 = BeautifulSoup(soup.prettify(), "lxml")
        # feed the html to the finder
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(str(soup2))
    except Exception as e:
        print(e)
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''  # holds the page HTML; urlopen returns bytes that we decode into this string
    try:
        response = urlopen(page_url)
        if response.getheader('Content-type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: Cannot Crawl Page')
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        response = requests.get(page_url, headers=header)
        content_type = response.headers['Content-Type']
        if content_type == 'text/html; charset=utf-8':
            html_string = response.text
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: can not crawl page')
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode(encoding='utf-8')
        elif response.getheader('content-type') == 'text/html;charset=utf-8':
            html_bytes = response.read()
            html_string = html_bytes.decode(encoding='utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print('\nException : ' + str(e) + '\n')
        return set()
    return finder.page_links()

def gather_links(page_url):
    html_string = ''
    try:
        # response = urlopen(page_url)
        # make sure we are connecting to an actual website
        # if response.getheader('Content-Type') == 'text/html':
        #     html_bytes = response.read()
        #     html_string = html_bytes.decode('utf-8')
        html_string = requests.get(page_url).text
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: Cannot crawl page - {}'.format(page_url))
        return set()
    return finder.get_page_links()

def gather_link(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        # convert the bytes returned by urlopen into a human-readable string
        if response.info()['Content-type'] == 'text/html' or \
           response.info()['Content-type'] == 'text/html; charset=utf-8' or \
           response.info()['Content-type'] == 'text/html; charset=UTF-8':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(PySpider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: can not crawl page')
        return set()
    return finder.page_links()

def gather_link(page_url):
    # go to the website, get the byte data, convert it to a string,
    # pass it through to LinkFinder, and find all the links
    html_string = ''
    finder = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()

def gather_link(page_url):
    html_string = ''
    # urlopen returns byte data which we have to turn into a readable string
    try:
        response = urlopen(page_url)
        # make sure it is html data (in case we crawl a pdf file)
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: Cannot crawl page')
        # Return empty set if we cannot crawl the link
        return set()
    return finder.page_links()

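# All of the gather_links variants above assume a LinkFinder helper with a
# feed() / page_links() (or get_links()) interface. The sketch below is a
# hypothetical minimal implementation included only for reference, built on
# Python's html.parser; the urljoin-based link resolution and the presence of
# both method names are assumptions inferred from how the snippets call the
# class, not the original implementation.
from html.parser import HTMLParser
from urllib.parse import urljoin


class LinkFinder(HTMLParser):
    """Minimal sketch: collects the href of every anchor tag while parsing HTML."""

    def __init__(self, base_url='', page_url=''):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # resolve each href against the page URL so relative links become absolute
        if tag == 'a':
            for attribute, value in attrs:
                if attribute == 'href' and value:
                    self.links.add(urljoin(self.page_url, value))

    def page_links(self):
        return self.links

    # some variants above call get_links() instead, so expose both names
    get_links = page_links
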
class Friends_finder():
    def __init__(self, user_name, password):
        self.user_name = user_name
        self.password = password
        self.facebook_url = "https://www.facebook.com/"
        self.more_clicks = 0
        self.existent_people_links = set()
        self.setup()
        self.log_in()
        while True:
            self.scroll_down_manually()
            self.gather_links()
            self.append_links_to_queue()

    def setup(self):
        print('Setting up WebDriver')
        self.driver = webdriver.Firefox()
        self.driver.get(self.facebook_url)

    def log_in(self):
        # retry until the email field is available, then submit the credentials
        ready = False
        while ready == False:
            ready = True
            try:
                self.driver.find_element_by_id("email").send_keys(self.user_name)
            except:
                ready = False
        self.driver.find_element_by_id("pass").send_keys(self.password)
        self.driver.find_element_by_id("pass").send_keys(Keys.RETURN)
        sleep(2)
        try:
            self.driver.find_element_by_xpath('//*[@id="u_0_2"]')
            print('Connected')
        except:
            print('Unable to connect, please do it manually')
            ready = False
            while ready == False:
                try:
                    self.driver.find_element_by_xpath('//*[@id="u_0_2"]')
                    ready = True
                except:
                    pass

    def scroll_down_manually(self):
        print("Please scroll down the page")
        print("When done, press any key to start gathering links")
        input()

    def gather_links(self):
        print('Gathering links, please wait ...')
        self.link_finder = LinkFinder()
        self.link_finder.feed(self.driver.page_source)
        self.gathered_links = self.link_finder.get_links()
        print(str(len(self.gathered_links)) + ' links were gathered')

    def append_links_to_queue(self):
        print('Appending links and updating the queue file...')
        self.get_existent_links()
        self.update_queue()

    def get_existent_links(self):
        with open("data/people_to_add.txt", "r") as f:
            for line in f:
                self.existent_people_links.add(line.replace('\n', ''))
        with open("data/errors.txt", "r") as f:
            for line in f:
                self.existent_people_links.add(line.replace('\n', ''))
        with open("data/added_friends.txt", "r") as f:
            for line in f:
                self.existent_people_links.add(line.replace('\n', ''))

    def update_queue(self):
        self.new_links_added = 0
        with open("data/people_to_add.txt", "a") as f:
            for item in self.gathered_links:
                if item not in self.existent_people_links:
                    self.new_links_added += 1
                    f.write(item + '\n')
        print(str(self.new_links_added) + ' items were added to the queue file')