示例#1
0
class Crawler:
    """Crawl the pages of one host, starting from ``start_url``.

    Pages are fetched with a ``urllib3.PoolManager`` and parsed with
    BeautifulSoup; the raw body of the most recently fetched page is kept
    in ``self.html`` and handed to ``self.processing.data_processing``.
    """

    # Lower-case extensions of links that are neither queued nor reported.
    _SKIPPED_EXTENSIONS = ('.jpg', '.pdf')

    def __init__(self, start_url):
        """Remember the start URL and its host (netloc) and set up helpers."""
        self.start_url = start_url
        self.host = urlparse(start_url)[1]
        self.http = urllib3.PoolManager()
        self.processing = Processing()
        # Raw body of the last successfully fetched page (None until then).
        self.html = None

    def neighbor(self, url):
        """Fetch ``url`` and return its parsed soup, or ``None`` on failure.

        On a successful request the raw response body is stored in
        ``self.html``.
        """
        try:
            response = self.http.request("GET", url)
            self.html = response.data
            return BeautifulSoup(self.html, "lxml")
        except Exception:
            # ``Exception`` (not a bare except) so Ctrl-C still stops the crawl.
            print("could not open url %s " % url)
            return None

    def test(self, url):
        """Fetch a single ``url`` and pass its raw body to the processor."""
        response = self.http.request("GET", url)
        html = response.data
        self.processing.data_processing(html)

    def bfs(self):
        """Breadth-first crawl of every page reachable on ``self.host``.

        Every new link found on an on-host page is printed and queued;
        queued pages that turn out to live on another host are dropped
        without being fetched, so their links cannot mark on-host pages as
        already seen.
        """
        from collections import deque

        queue = deque([self.start_url])
        # The start page counts as seen so links back to it are not re-queued.
        visited = {self.start_url}

        while queue:
            # popleft() gives FIFO order, i.e. an actual breadth-first walk.
            link = queue.popleft()
            print(len(queue))

            page_url = urljoin(self.start_url, link)
            if urlparse(page_url)[1] != str(self.host):
                continue

            soup = self.neighbor(link)
            if soup is None:
                continue

            try:
                for anchor in soup.findAll('a', href=True):
                    # Resolve relative links against the page they were found
                    # on, and drop the fragment so "#section" variants of the
                    # same page collapse into one entry.
                    target = urljoin(page_url, anchor['href']).split('#')[0]
                    ext = os.path.splitext(target)[1].lower()
                    if target in visited or ext in self._SKIPPED_EXTENSIONS:
                        continue
                    visited.add(target)
                    queue.append(target)
                    # NOTE(review): the page body is re-processed once per
                    # newly queued link, as in the original code -- confirm
                    # this is intended rather than once per page.
                    self.processing.data_processing(self.html)
                    print(target)
            except Exception as exc:
                # Best effort: one bad page must not abort the whole crawl,
                # but the failure is reported instead of silently dropped.
                print("error while processing %s: %s" % (link, exc))
class Data():
    """Interactive picker of two CSV columns for statistical evaluation.

    The CSV is loaded with pandas; the user chooses two column indices on
    stdin and the matching columns are handed to
    ``self.processing.data_processing``.
    """

    def __init__(self, csv_path='file.csv'):
        """Load ``csv_path`` (default ``'file.csv'``) and prepare state."""
        self.processing = Processing()
        self.file = pd.read_csv(csv_path)
        self.header = self.file.columns.tolist()
        # Column names matching the chosen indices (filled by controller()).
        self.option_list = []
        # Column indices chosen by the user (filled by selections()).
        self.list = []

    def initialize(self):
        """Print the numbered column menu and collect two selections."""
        index = len(self.header)
        print("Choose two options for statistical evaluation: \n")
        for i, name in enumerate(self.header):
            print(i, " = ", name)
        # Start from a clean slate so the menu can be shown more than once.
        self.list = []
        self.selections(index)

    def selections(self, index):
        """Read choices from stdin until two valid column indices are held.

        A choice is valid when it is an integer in ``range(index)``; anything
        else prints a hint and prompts again. Once two choices are stored in
        ``self.list``, ``controller`` is called exactly once.
        """
        while len(self.list) < 2:
            # Only the int() conversion is guarded, so a ValueError raised by
            # downstream processing is never mistaken for bad user input.
            try:
                choice = int(input())
            except ValueError:
                print("Select one of the above options (numeric value): ")
                continue
            if not 0 <= choice < index:
                print("Choose an element from the list ...")
                continue
            self.list.append(choice)
        self.controller(self.list)

    def controller(self, list):
        """Resolve the chosen indices to columns and run the processing.

        ``list`` holds (at least) two indices into ``self.header``; the first
        two resulting columns are passed to ``data_processing`` together with
        the list of selected column names.
        """
        # Rebuilt rather than appended to, so repeated calls do not pile up
        # stale names in front of the current selection.
        self.option_list = [self.header[x] for x in list]

        index1 = self.file[self.option_list[0]]
        index2 = self.file[self.option_list[1]]

        self.processing.data_processing(index1, index2, self.option_list)