Пример #1
0
class Crawler:
    """Layer-by-layer (BFS) web crawler that hands page words to an Indexer.

    Each layer's URLs are fetched one at a time; the links found on those
    pages are collected and become the queue of the next layer.
    """

    def __init__(self, numOfLayer):
        """Create the crawl state and queue the start URL.

        Args:
            numOfLayer: upper bound on the number of layers ``scrape`` visits.
        """
        self.num = numOfLayer
        # URLs waiting to be fetched in the current layer.
        self.parent = []
        # Links discovered while fetching the current layer.
        self.children = []
        # URLs that getOnePage has already dealt with.
        self.handled = []
        self.Indexer = Indexer()
        self.Processor = Processor()
        self.Porter = PorterStemmer()
        # URLs already stored in the database; filled in by scrape().
        self.db = []

        link = "http://www.cse.ust.hk/"
        self.parent.append(link)

    def handleLink(self, links):
        """Pass *links* through every Processor step and return the result.

        The steps run in this order: waiveUnrelatedDomain, clearSubfix,
        clearUnwantedFiles, changeUrl, clearDuplicate. What each step does
        is defined by the Processor class, which lives outside this block.

        Args:
            links: the collection of URLs to clean up.

        Returns:
            Whatever ``Processor.clearDuplicate`` returns for the cleaned
            links.
        """
        processor = self.Processor
        pipeline = (
            processor.waiveUnrelatedDomain,
            processor.clearSubfix,
            processor.clearUnwantedFiles,
            processor.changeUrl,
            processor.clearDuplicate,
        )
        for step in pipeline:
            links = step(links)
        return links

    def _collectChildLinks(self, parent, soup):
        """Return the cleaned links of *soup* and queue them as children."""
        rawLinks = [urljoin(parent, anchor.get('href'))
                    for anchor in soup.findAll('a', href=True)]
        children = self.handleLink(rawLinks)
        for child in children:
            try:
                print(child.encode('ascii'))
            except UnicodeError:
                # UnicodeError covers both the encode failure of a unicode
                # link and the implicit decode failure of a byte-string one.
                print("there are non-ascii characters in there")
            self.children.append(child)
        return children

    def _extractWords(self, soup):
        """Return the stemmed ASCII words found in the <p> tags of *soup*."""
        tokens = []
        for tag in soup.find_all('p'):
            tokens.extend(tag.getText().split())

        words = []
        for token in tokens:
            # NOTE: this relies on Python 2 byte strings, where iterating
            # the encoded text yields one-character strings for ord().
            rawtext = token.encode('utf-8').strip()
            rawtext = "".join(ch for ch in rawtext if ord(ch) < 128)
            # Punctuation becomes a separator, so "a,b" yields two words.
            for mark in string.punctuation:
                rawtext = rawtext.replace(mark, " ")
            words.extend(rawtext.split())

        return [self.Porter.stem(word) for word in words]

    def getOnePage(self):
        """Fetch the next queued URL, collect its links and index its words.

        The first URL of ``self.parent`` is removed from the queue. If it is
        already in ``self.handled`` nothing else happens. Otherwise the page
        is requested (20 second timeout); on an OK response its links are
        appended to ``self.children`` and, unless the URL is already listed
        in ``self.db``, its stemmed words are passed to the Indexer. The URL
        is then recorded in ``self.handled`` whether or not the request
        succeeded.

        Raises:
            IndexError: if ``self.parent`` is empty.
        """
        parent = self.parent.pop(0)

        # Already visited: leave now so the URL is not recorded twice.
        if parent in self.handled:
            return

        print("")
        print("Searching {}".format(parent))
        try:
            response = requests.get(parent, timeout=20)
            # Only pages that answered successfully are parsed.
            if response.status_code == requests.codes.ok:
                soup = BeautifulSoup(response.text, 'html.parser')
                children = self._collectChildLinks(parent, soup)

                # Pages already known to the database are not re-indexed.
                if parent not in self.db:
                    processedWords = self._extractWords(soup)
                    if processedWords:
                        self.Indexer.process(parent, processedWords,
                                             children)
                    else:
                        print("The document contains no word")

        except requests.exceptions.ConnectionError:
            print("Error in connecting the site.")
        except requests.exceptions.Timeout:
            print("Timeout in connecting the site.")
        except requests.exceptions.RequestException:
            # Any other request failure (bad URL, redirect loop, ...) must
            # not abort the whole crawl; skip this page like the cases above.
            print("Error in requesting the site.")

        self.handled.append(parent)

    # search by BFS
    def scrape(self):
        """Crawl at most ``self.num`` layers, starting from ``self.parent``.

        Before crawling, the ``'url'`` field of every record returned by
        ``db.getAll()`` is appended to ``self.db``. ``db`` is a module-level
        name defined outside this class.
        """
        for instance in db.getAll():
            self.db.append(instance['url'])
        print(len(self.handled))

        for layer in range(self.num):
            self.parent = self.handleLink(self.parent)
            print("")
            print("Searching layer {}".format(layer))

            if not self.parent:
                break

            # getOnePage removes exactly one URL from the queue per call, so
            # this drains the current layer.
            for _ in range(len(self.parent)):
                self.getOnePage()

            # The links found on this layer form the next layer.
            self.parent = self.children
            self.children = []