예제 #1
0
파일: crawler.py 프로젝트: yasarb/bil372
    def traverse(self):
        """Drain the URL queue, filling still-empty fields from each page.

        For every queued URL the source returned by ``self.get_source_code``
        is wrapped in a ``Parser``.  Each entry of ``self.__fields`` that is
        not ``None`` and still has length 0 is then assigned the result of
        the matching ``Parser.find_*`` method.  ``publication`` and
        ``course`` are only attempted on the start URL (``self.URL``) or on
        URLs containing one of a few hint substrings.

        Extraction is best-effort: a finder that raises leaves its field
        untouched, and a page whose ``Parser`` cannot be built is skipped, so
        a single bad page never aborts the crawl.  Swallowed errors are
        reported at debug level through ``logging``.

        Returns:
            None.  Results accumulate in ``self.__fields``.
        """
        import logging

        log = logging.getLogger(__name__)

        # (field key, Parser method name, URL hint substrings or None).
        # The order is the order in which the fields are attempted.
        finders = (
            ("name", "find_name", None),
            ("uni", "find_uniname", None),
            ("dept", "find_dept", None),
            ("tel", "find_phone", None),
            ("email", "find_email", None),
            ("publication", "find_publication",
             ("publica", "research", "article")),
            ("course", "find_courses", ("teach", "course", "class")),
            ("address", "find_address", None),
            ("interest", "find_interest", None),
            ("rank", "find_rank", None),
        )

        while not self.__URLQueue.empty():
            URL = self.__URLQueue.get()

            # Loading the page is guarded as well: a page that cannot be
            # fetched or parsed is skipped instead of ending the whole crawl.
            try:
                self.__p = Parser(source=self.get_source_code(URL), url=URL)
            except Exception:
                log.debug("skipping %r: page could not be loaded", URL,
                          exc_info=True)
                continue

            for key, finder_name, url_hints in finders:
                try:
                    if url_hints is not None:
                        # Hinted fields are only looked for on the start page
                        # or on pages whose URL mentions the topic.
                        lowered = URL.lower()
                        on_topic = (lowered == self.URL.lower() or
                                    any(hint in lowered for hint in url_hints))
                        if not on_topic:
                            continue

                    current = self.__fields[key]
                    # Only fill a field once; None marks a field that must
                    # not be (re)filled.
                    if current is not None and len(current) == 0:
                        self.__fields[key] = getattr(self.__p, finder_name)()
                except Exception:
                    # Deliberately best-effort: one failing finder must not
                    # prevent the remaining fields from being tried.
                    log.debug("%s failed for %r", finder_name, URL,
                              exc_info=True)
예제 #2
0
파일: crawler.py 프로젝트: yasarb/bil372
class Crawler:
    """Crawl a home page and its keyword-matching links to fill a field dict.

    ``run`` queues the start URL plus every same-host link on it whose href
    contains one of ``__keywords``, then ``traverse`` applies the ``Parser``
    ``find_*`` extractors to each queued page until the fields are filled.
    """

    # (field key, Parser method name, URL hint substrings or None).
    # The order is the order in which the fields are attempted on each page.
    _FIELD_FINDERS = (
        ("name", "find_name", None),
        ("uni", "find_uniname", None),
        ("dept", "find_dept", None),
        ("tel", "find_phone", None),
        ("email", "find_email", None),
        ("publication", "find_publication", ("publica", "research", "article")),
        ("course", "find_courses", ("teach", "course", "class")),
        ("address", "find_address", None),
        ("interest", "find_interest", None),
        ("rank", "find_rank", None),
    )

    def __init__(self, URL):
        """Create a crawler rooted at *URL* with an empty queue and fields."""
        self.URL = URL
        self.__URLQueue = queue.Queue()
        self.__p = None
        # A link is followed only if its href contains one of these.
        self.__keywords = {"contact", "research", "biography", "publication", "class"}
        # Extraction results; an empty value means "not found yet".
        self.__fields = {"name": "", "uni": "", "dept":"", "tel": "",
                         "email": "", "publication": "",
                         "address": "", "course": "", "interest": "", "rank": ""}

    @staticmethod
    def _strip_blocks(markup, opener, closer):
        """Return *markup* with every ``opener ... closer`` span removed.

        Args:
            markup: Raw page content as ``bytes``.
            opener: Lower-case prefix of the opening tag, e.g. ``b'<script'``.
            closer: Lower-case closing tag, e.g. ``b'</script>'``.

        Matching is case-insensitive.  A block that is opened but never
        closed is dropped through to the end of the input, so the scan always
        terminates.
        """
        # bytes.lower() only touches ASCII letters, so offsets found in the
        # lowered copy are valid in the original buffer.
        lowered = markup.lower()
        kept = []
        cursor = 0
        while True:
            start = lowered.find(opener, cursor)
            if start == -1:
                kept.append(markup[cursor:])
                break
            kept.append(markup[cursor:start])
            end = lowered.find(closer, start)
            if end == -1:
                # Unterminated block: nothing after it is usable markup.
                break
            cursor = end + len(closer)
        return b"".join(kept)

    @staticmethod
    def get_source_code(url):
        """Download *url* and return its text without style/script blocks.

        Args:
            url: Address to fetch; passed to ``urllib.request.Request``.

        Returns:
            The response body decoded as UTF-8 (undecodable bytes are
            ignored) with every ``<style ...>...</style>`` and
            ``<script ...>...</script>`` block removed, or ``None`` when the
            request fails.  The error is printed in that case.
        """
        from http.client import HTTPException

        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'
        headers = {'User-Agent': user_agent, }
        try:
            request = urllib.request.Request(url, None, headers)
            with urllib.request.urlopen(request) as response:
                code = response.read()
        except (OSError, ValueError, HTTPException) as e:
            # urllib reports network/HTTP failures as URLError/HTTPError
            # (both OSError subclasses) and malformed URLs as ValueError.
            print(e)
            return None

        # don't care between style tags
        code = Crawler._strip_blocks(code, b'<style', b'</style>')
        # don't care between script tags
        code = Crawler._strip_blocks(code, b'<script', b'</script>')
        return codecs.decode(code, 'utf-8', 'ignore')

    def get_links(self, source, url):
        """Queue the keyword-matching, same-host links found in *source*.

        Args:
            source: Markup of the page at *url*; nothing is queued when it is
                empty or ``None`` (e.g. after a failed download).
            url: Address of that page, used to resolve relative hrefs and to
                decide which host counts as "the same site".
        """
        from urllib.parse import urlparse, urljoin

        if not source:
            return

        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(source, "html.parser")
        hrefs = {anchor["href"] for anchor in soup.find_all('a', href=True)}
        home_host = urlparse(url).netloc

        for link in hrefs:
            if not any(keyword in link for keyword in self.__keywords):
                continue
            # urljoin returns absolute links unchanged and resolves relative
            # ones against the page address.
            target = urlparse(urljoin(url, link))
            # Skip mailto:/javascript: style hrefs and links to other hosts.
            if target.scheme in ("http", "https") and target.netloc == home_host:
                self.__put_link(target.geturl())

    def __put_link(self, url):
        """Append *url* to the queue of pages waiting to be parsed."""
        self.__URLQueue.put(url)

    def traverse(self):
        """Drain the URL queue, filling still-empty fields from each page.

        Each entry of ``self.__fields`` that is not ``None`` and still has
        length 0 is assigned the result of the matching ``Parser.find_*``
        method.  ``publication`` and ``course`` are only attempted on the
        start URL or on URLs containing one of their hint substrings.

        Extraction is best-effort: a finder that raises leaves its field
        untouched, and a page whose ``Parser`` cannot be built is skipped.
        Swallowed errors are reported at debug level through ``logging``.
        """
        import logging

        log = logging.getLogger(__name__)

        while not self.__URLQueue.empty():
            URL = self.__URLQueue.get()

            # Guarded so that one unloadable page does not end the crawl.
            try:
                self.__p = Parser(source=self.get_source_code(URL), url=URL)
            except Exception:
                log.debug("skipping %r: page could not be loaded", URL,
                          exc_info=True)
                continue

            for key, finder_name, url_hints in self._FIELD_FINDERS:
                try:
                    if url_hints is not None:
                        lowered = URL.lower()
                        on_topic = (lowered == self.URL.lower() or
                                    any(hint in lowered for hint in url_hints))
                        if not on_topic:
                            continue

                    current = self.__fields[key]
                    # Only fill a field once; None marks a field that must
                    # not be (re)filled.
                    if current is not None and len(current) == 0:
                        self.__fields[key] = getattr(self.__p, finder_name)()
                except Exception:
                    # Deliberately best-effort: one failing finder must not
                    # prevent the remaining fields from being tried.
                    log.debug("%s failed for %r", finder_name, URL,
                              exc_info=True)

    def run(self):
        """Crawl from ``self.URL`` and return the dict of collected fields."""
        self.__URLQueue.put(self.URL)
        self.get_links(self.get_source_code(self.URL), self.URL)

        self.traverse()

        return self.__fields