Example #1
class NhaDat247(CrawlerObject):

    BASE_URL = "https://nhadat247.com.vn/"
    SAVE_CHECK_POINT = 5

    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):

        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
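        # When resuming, restore the crawl parameters (post type, date range,
        # limit) from the paused worker record stored in the database.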
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nhadat247.com.vn":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]

                try:
                    self.limit = int(info_dict_["limit"])
                except:
                    self.limit = -1

                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except:
                traceback.print_exc()
                return

        self.__str_info = "Site: nhadat247.com.vn, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to, str(self.limit)
            if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = NhaDat247.get_seed_url(post_type)

        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []

        self.file_log_visited_url = "visited_post_log_nhadat247_%s.txt" % (
            self.post_type)
        self.file_log_new_url = "local_urls_log_nhadat247_%s.txt" % (
            self.post_type)

        # regex_sub_url matches category/listing pages (".../ban-....html" or a
        # paginated ".../ban-.../<page>"); regex_post matches post detail pages
        # ending in "pr<id>.html".
        self.regex_sub_url = re.compile(
            r"([a-z][-a-z]*)?ban-[-a-z]+((\.html)|(/[0-9]+))?")
        self.regex_post = re.compile(
            r"([a-z][-a-z]*)?ban-[-a-z0-9]+/[-a-z0-9]+pr[0-9]+\.html")

        self.key_type = NhaDat247.get_key_from_type(self.post_type)

        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from":
                datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to":
                datetime.strptime(
                    str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)

        if not self.resume:
            task_id = int(time.time())

        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }

        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]

        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post,
                                                              num_error)
        self.db_object.update_wokers_info(Settings.worker_id,
                                          self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """
        Return Beautifulsoup object
        """
        _soup = None
        _html = None
        for i in range(5):
            try:
                element_present = EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "body > div.footer"))
                _html = self.browser.get_html(url, until_ec=element_present)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception as e:
                traceback.print_exc()
                continue

        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["ban-dat"]
        elif key == "apartment":
            return ["ban-can-ho-chung-cu"]
        elif key == "house":
            return ["ban-nha-mat-pho", "ban-nha-biet-thu", "ban-nha-rieng"]

        return [
            "ban-dat", "ban-can-ho-chung-cu", "ban-nha-rieng",
            "ban-nha-mat-pho", "ban-nha-biet-thu"
        ]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                # print("ok")
                return True

        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date,
                    _html):

        post = {}

        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)

        # post["html"] = "<html>"
        # print("-"*10,"\n",post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []

        if self.resume:
            try:
                with open(self.file_log_new_url, "r") as f:
                    local_urls = [line.strip() for line in f if line.strip()]
            except Exception:
                pass
            try:
                with open(self.file_log_visited_url, "r") as f:
                    visited_post = [line.strip() for line in f if line.strip()]
            except Exception:
                pass

        return local_urls, visited_post

    def get_date(self, page_soup: BeautifulSoup) -> date:
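        # The listing shows the post date either relative to today ("hôm nay",
        # "hôm qua", "hôm kia") or as an absolute dd-mm-yyyy string.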
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#ContentPlaceHolder1_ProductDetail1_divprice > div").get_text(
                ).split("|")[1]
            str_date = slugify(str_date.strip().lower())
            if "hom-kia" in str_date:
                post_date = date.today() - timedelta(days=2)
            elif "hom-qua" in str_date:
                post_date = date.today() - timedelta(days=1)
            elif "hom-nay" in str_date:
                post_date = date.today()
            else:

                post_date = datetime.strptime(str_date, '%d-%m-%Y').date()

        except Exception as e:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
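        # Fetch the URL and return (page_source or None, post_date, new links).
        # page_source is kept only for post pages whose date is in range.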
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)

        if page_soup:

            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                if isinstance(post_date, date) and (
                        not self.post_date_range
                        or self.post_date_range["from"] <= post_date <= self.post_date_range["to"]):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None

            else:
                page_source = None

            list_href = page_soup.find_all('a')

            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)

                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor)
                        or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)

        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):
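        # Breadth-first crawl: pop the oldest frontier URL, queue newly found
        # links, buffer matching posts, and checkpoint every SAVE_CHECK_POINT
        # visits.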

        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)

            # skip malformed, already-visited, or off-type URLs
            if (len(self.__current_url) < 10
                    or self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue

            print(" > ", self.__current_url)

            page_source, post_date, new_urls_to_visit = self.visit(
                self.__current_url)

            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit

            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(
                                     date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)

            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count,
                                                 len(self.__failed_urls))
                self.update_crawling_log()

                NhaDat247.save_list(local_urls, self.file_log_new_url)
                NhaDat247.save_list(visited_post, self.file_log_visited_url)

            num_visited += 1
            print("  >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break

        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)
        return

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment": ["https://nhadat247.com.vn/ban-can-ho-chung-cu.html"],
            "house": [
                "https://nhadat247.com.vn/ban-nha-rieng.html",
                "https://nhadat247.com.vn/ban-nha-biet-thu-lien-ke.html",
                "https://nhadat247.com.vn/ban-nha-mat-pho.html"
            ],
            "land": [
                "https://nhadat247.com.vn/ban-dat-nen-du-an.html",
                "https://nhadat247.com.vn/ban-dat.html"
            ]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))
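
A minimal usage sketch for the class above, assuming Settings.worker_id identifies a registered worker and the project-level DBObject and Browser dependencies are importable; the argument values are illustrative only:

# hypothetical driver for the NhaDat247 crawler
crawler = NhaDat247(date_from="1/2021", date_to="3/2021",
                    post_type="house", limit=100)
crawler.rotate_ip(False)   # keep a fixed IP; pass True to rotate via Browser
crawler.obtain_data()      # breadth-first crawl; buffered posts go to the DB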
Example #2
class ChoTotCrawler(CrawlerObject):

    BASE_URL = "https://nha.chotot.com/"
    SAVE_CHECK_POINT = 5

    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):

        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nha.chotot.com":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]

                try:
                    self.limit = int(info_dict_["limit"])
                except:
                    self.limit = -1

                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except:
                traceback.print_exc()
                return

        self.__str_info = "Site: nha.chotot.com, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to, str(self.limit)
            if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = ChoTotCrawler.get_seed_url(post_type)

        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []

        self.file_log_visited_url = "visited_post_log_chotot_%s.txt" % (
            self.post_type)
        self.file_log_new_url = "local_urls_log_chotot_%s.txt" % (
            self.post_type)

        # regex_sub_url matches category/listing pages; regex_post matches post
        # detail pages of the form ".../<category>/<slug>.htm".
        self.regex_sub_url = re.compile(
            r"([a-z][-a-z]*)?ban-[-a-z]+((\.htm)|(/[0-9]+))?")
        self.regex_post = re.compile(
            r"([a-z][-a-z]+)?[/][a-z][-a-z0-9]+/[-a-z0-9]+\.htm")

        self.key_type = ChoTotCrawler.get_key_from_type(self.post_type)

        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from":
                datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to":
                datetime.strptime(
                    str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)

        if not self.resume:
            task_id = int(time.time())

        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }

        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]

        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post,
                                                              num_error)
        self.db_object.update_wokers_info(Settings.worker_id,
                                          self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """
        Return Beautifulsoup object
        """
        _soup = None
        _html = None
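        # On post pages, run a script that clicks the "show phone" element so
        # the full contact number is present in the saved HTML.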
        click_phone_script = """
            function getElementByXpath(path) {
                return document.evaluate(path, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            }

            var phone = getElementByXpath("//*[@id='__next']/div[3]/div[1]/div/div[4]/div[3]/div/linkcontact");
            if (phone != null) {
                phone.click();
            }                    
        """

        for i in range(5):
            try:
                is_post = re.search(self.regex_post, url)
                element_present = EC.presence_of_element_located(
                    (By.XPATH, """//html/body/div[1]/footer"""))
                _html = self.browser.get_html(
                    url=url,
                    until_ec=element_present,
                    run_script=click_phone_script if is_post else None)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception as e:
                traceback.print_exc()
                continue

        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["mua-ban-dat"]
        elif key == "apartment":
            return ["mua-ban-can-ho-chung-cu"]
        elif key == "house":
            return ["mua-ban-nha-dat"]

        return ["mua-ban-dat", "mua-ban-nha-dat", "mua-ban-can-ho-chung-cu"]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                # print("ok")
                return True

        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date,
                    _html):

        post = {}

        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)

        # post["html"] = "<html>"
        # print("-"*10,"\n",post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []

        if self.resume:
            try:
                with open(self.file_log_new_url, "r") as f:
                    local_urls = [line.strip() for line in f if line.strip()]
            except Exception:
                pass
            try:
                with open(self.file_log_visited_url, "r") as f:
                    visited_post = [line.strip() for line in f if line.strip()]
            except Exception:
                pass

        return local_urls, visited_post

    @staticmethod
    def convert_str2date(date_str):
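        # Chợ Tốt shows relative post dates ("hôm nay", "n ngày/tuần/tháng
        # trước"); map them to absolute dates, else parse an explicit date.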
        _date = None

        date_str = slugify(date_str.lower())
        _l = date_str.split("-")
        if "hom-qua" in date_str:
            _date = date.today() - timedelta(days=1)
        elif "thang" in _l:
            _n = int(_l[_l.index("thang") - 1][0])
            _date = date.today() - timedelta(days=30 * _n)
        elif "tuan" in _l:
            _n = int(_l[_l.index("tuan") - 1][0])
            _date = date.today() - timedelta(days=7 * _n)
        elif "ngay" in _l:
            _n = int(_l[_l.index("ngay") - 1][0])
            _date = date.today() - timedelta(days=_n)
        elif "hom-nay" in date_str or "gio" in _l or "phut" in _l:
            _date = date.today()
        else:
            _date = datetime.strptime(date_str, '%d-%m-%Y').date()

        return _date

    def get_date(self, page_soup: BeautifulSoup) -> date:
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#__next > div > div.ct-detail.adview > div > div.col-md-8 > div.adImageWrapper___KTd-h > div.imageCaption___cMU2J > span"
            ).get_text()
            str_date = str_date.strip()
            post_date = ChoTotCrawler.convert_str2date(str_date)

        except Exception as e:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)

        if page_soup:

            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                if isinstance(post_date, date) and (
                        not self.post_date_range
                        or self.post_date_range["from"] <= post_date <= self.post_date_range["to"]):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None

            else:
                page_source = None

            list_href = page_soup.find_all('a')

            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)

                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor)
                        or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)

        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):

        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)

            # skip malformed, already-visited, or off-type URLs
            if (len(self.__current_url) < 10
                    or self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue

            print(" > ", self.__current_url)

            page_source, post_date, new_urls_to_visit = self.visit(
                self.__current_url)

            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit

            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(
                                     date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)

            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count,
                                                 len(self.__failed_urls))
                self.update_crawling_log()

                ChoTotCrawler.save_list(local_urls, self.file_log_new_url)
                ChoTotCrawler.save_list(visited_post,
                                        self.file_log_visited_url)

            num_visited += 1
            print("  >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break

        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)
        return

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment":
            ["https://nha.chotot.com/toan-quoc/mua-ban-can-ho-chung-cu"],
            "house": ["https://nha.chotot.com/toan-quoc/mua-ban-nha-dat"],
            "land": ["https://nha.chotot.com/toan-quoc/mua-ban-dat"]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))
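
A minimal sketch of the resume path, assuming the worker record already holds a paused "crawling" task for nha.chotot.com (otherwise __init__ returns early and the instance is not usable):

# hypothetical driver: resume a previously paused Chợ Tốt crawl
crawler = ChoTotCrawler(resume=True)
crawler.obtain_data()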