class Crawler:
    """Breadth-first crawler restricted to the host of the start URL.

    Discovered pages are fed to the project's ``Processing`` pipeline.
    """

    def __init__(self, start_url):
        self.start_url = start_url
        # netloc of the seed URL; only links on this host are enqueued.
        self.host = urlparse(start_url).netloc
        self.http = urllib3.PoolManager()
        self.processing = Processing()
        # Raw bytes of the most recently fetched page (set by neighbor()).
        self.html = None

    def neighbor(self, url):
        """Fetch *url* and return its parsed BeautifulSoup tree.

        Returns None (instead of raising) when the request fails, so the
        crawl can continue past dead links.
        """
        try:
            response = self.http.request("GET", url)
        except Exception:  # narrowed from bare except: report and skip
            print("could not open url %s " % url)
            return None
        self.html = response.data
        return BeautifulSoup(self.html, "lxml")

    def test(self, url):
        """Fetch a single *url* and hand its raw HTML to the processor."""
        response = self.http.request("GET", url)
        html = response.data
        self.processing.data_processing(html)

    def bfs(self):
        """Crawl breadth-first from ``start_url``, processing each page.

        Fixes vs. the original:
        - uses a FIFO deque; the original popped from the tail of a list,
          which is LIFO and therefore depth-first despite the method name
        - ``visited`` is a set (O(1) membership, was an O(n) list scan)
        - the same-host check now inspects the candidate link's host; the
          original checked the *current page's* host, so off-host links
          were enqueued and crawled
        - each page is processed once; the original called
          ``data_processing`` once per newly discovered link on the page
        """
        from collections import deque  # local import: file header not editable here

        queue = deque([self.start_url])
        visited = set()
        # Extensions we never follow (case-insensitive).
        skipped_exts = {'.jpg', '.pdf'}
        while queue:
            link = queue.popleft()
            print(len(queue))
            soup = self.neighbor(link)
            if soup is None:
                # Fetch failed; neighbor() already reported it.
                continue
            # Process the fetched page exactly once.
            self.processing.data_processing(self.html)
            for anchor in soup.findAll('a', href=True):
                # Resolve relative hrefs and drop any fragment.
                tag = urljoin(self.start_url, anchor['href'])
                tag = tag.split('#')[0]
                _, ext = os.path.splitext(tag)
                if tag in visited or ext.lower() in skipped_exts:
                    continue
                visited.add(tag)
                # Only enqueue links that stay on the seed host.
                if urlparse(tag).netloc == str(self.host):
                    queue.append(tag)
                print(tag)
class Data:
    """Interactive selector of two CSV columns for statistical evaluation.

    Reads ``file.csv``, lets the user pick two columns by index, and hands
    the two column Series to the project's ``Processing`` pipeline.
    """

    def __init__(self):
        self.processing = Processing()
        # NOTE(review): hard-coded input path — assumes file.csv is in CWD.
        self.file = pd.read_csv('file.csv')
        self.header = self.file.columns.tolist()
        # Column *names* chosen by the user (filled by controller()).
        self.option_list = []
        # Column *indices* chosen by the user (filled by selections()).
        self.list = []

    def initialize(self):
        """Print the numbered column menu and start the selection prompt."""
        index = len(self.header)
        print("Choose two options for statistical evaluation: \n")
        for i in range(0, index):
            print(i, " = ", self.header[i])
        self.selections(index)

    def selections(self, index):
        """Prompt until two valid column indices are collected, then dispatch.

        Rewritten as an iterative retry loop. The original recursed on bad
        input and — crucially — still appended an out-of-range choice after
        the recursive retry returned, so invalid indices ended up in the
        selection. It also accepted negative numbers, which silently pick
        columns from the *end* of the header list.
        """
        while len(self.list) < 2:
            try:
                choice = int(input())
            except ValueError:
                print("Select one of the above options (numeric value): ")
                continue
            if choice < 0 or choice >= index:
                print("Choose an element from the list ...")
                continue
            self.list.append(choice)
        self.controller(self.list)

    def controller(self, list):
        """Map chosen indices to column names and run the processing step."""
        for x in list:
            self.option_list.append(self.header[x])
        index1 = self.file[self.option_list[0]]
        index2 = self.file[self.option_list[1]]
        self.processing.data_processing(index1, index2, self.option_list)