Example #1
    def crawl(self, URL, depth=0):

        if depth <= P.crawlerProp.depth:
            print("DEPTH >> ", depth)
            print("VISITING URL >> ", URL)

            try:
                # Fetch the page and collect its anchor (<a>) tags.
                requester = Requester(URL)
                HTML = requester.getHtml()
                parser = Parser(HTML)
                links = parser.getTag('a')

                # Extract words from the tags listed in the crawler's atlas,
                # filtered by the configured part-of-speech tag.
                words = F.extractWords(
                    parser.getTags(list(P.crawlerProp.atlas.keys())),
                    P.crawlerProp.pos_tag)

                print(words)

                depth += 1

                for link in links:
                    if link is not None and Tag(link).hasKey('href'):
                        href = link['href']
                        # Use the href as-is when it is already a valid
                        # absolute URL; otherwise resolve it against the
                        # current page and validate the result.
                        if F.urlValid(href):
                            nURL = href
                        else:
                            fixed = F.urlFix(URL, href)
                            nURL = fixed if F.urlValid(fixed) else None

                        # Only descend into URLs that pass the filter,
                        # recording them as visited first.
                        if self.__pass(nURL):
                            self.visited.append(nURL)
                            self.crawl(nURL, depth)
            except Exception as exc:
                # Report the failure for this URL and keep crawling.
                print("ERROR >> ", exc)
        else:
            print("REACHED DEPTH LIMIT FOR >> ", URL)